From 34d581f2dcffa9f54e96af230a56cb01e8e2fccd Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Fri, 16 Dec 2016 21:41:38 -0800 Subject: VideoCore/Shader: Extract input vertex loading code into function --- src/video_core/command_processor.cpp | 6 ++++-- src/video_core/shader/shader.cpp | 30 +++++++++++++----------------- src/video_core/shader/shader.h | 12 +++++++++--- 3 files changed, 26 insertions(+), 22 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index ea58e9f54..36f72393b 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -149,7 +149,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast(&immediate_input)); - g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes + 1); + shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); + g_state.vs.Run(shader_unit); Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); @@ -283,7 +284,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); - g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); + shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); + g_state.vs.Run(shader_unit); // Retrieve vertex from register data output_vertex = shader_unit.output_registers.ToVertex(regs.vs); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 7ae57e619..8dca9d0cb 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -76,6 +76,17 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { return ret; } +void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { + // 
Setup input register table + const auto& attribute_register_map = g_state.regs.vs.input_register_map; + + for (int i = 0; i < num_attributes; i++) + registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; + + conditional_code[0] = false; + conditional_code[1] = false; +} + #ifdef ARCHITECTURE_x86_64 static std::unordered_map> shader_map; static const JitShader* jit_shader; @@ -109,21 +120,12 @@ void ShaderSetup::Setup() { MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) { +void ShaderSetup::Run(UnitState& state) { auto& config = g_state.regs.vs; auto& setup = g_state.vs; MICROPROFILE_SCOPE(GPU_Shader); - // Setup input register table - const auto& attribute_register_map = config.input_register_map; - - for (int i = 0; i < num_attributes; i++) - state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { jit_shader->Run(setup, state, config.main_offset); @@ -145,13 +147,7 @@ DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_ // Setup input register table boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); - const auto& attribute_register_map = config.input_register_map; - for (int i = 0; i < num_attributes; i++) - state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - + state.LoadInputVertex(input, num_attributes); RunInterpreter(setup, state, debug_data, config.main_offset); return debug_data; } diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 2b07759b9..c5d23e0ea 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -142,6 +142,14 @@ 
struct UnitState { return 0; } } + + /** + * Loads the unit state with an input vertex. + * + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes to load + */ + void LoadInputVertex(const InputVertex& input, int num_attributes); }; /// Clears the shader cache @@ -182,10 +190,8 @@ struct ShaderSetup { /** * Runs the currently setup shader * @param state Shader unit state, must be setup per shader and per shader unit - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes */ - void Run(UnitState& state, const InputVertex& input, int num_attributes); + void Run(UnitState& state); /** * Produce debug information based on the given shader and input vertex -- cgit v1.2.3 From e3caf669b05bc0727053885ee7e6e5c78d655df4 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Fri, 16 Dec 2016 21:48:36 -0800 Subject: VideoCore/Shader: Use self instead of g_state.vs in ShaderSetup --- src/video_core/shader/shader.cpp | 16 +++++++--------- src/video_core/shader/shader.h | 3 +-- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 8dca9d0cb..868be1360 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -102,8 +102,8 @@ void ShaderSetup::Setup() { #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { u64 cache_key = - Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ - Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)); + Common::ComputeHash64(&program_code, sizeof(program_code)) ^ + Common::ComputeHash64(&swizzle_data, sizeof(swizzle_data)); auto iter = shader_map.find(cache_key); if (iter != shader_map.end()) { @@ -122,33 +122,31 @@ MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); void ShaderSetup::Run(UnitState& state) { 
auto& config = g_state.regs.vs; - auto& setup = g_state.vs; MICROPROFILE_SCOPE(GPU_Shader); #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { - jit_shader->Run(setup, state, config.main_offset); + jit_shader->Run(*this, state, config.main_offset); } else { DebugData dummy_debug_data; - RunInterpreter(setup, state, dummy_debug_data, config.main_offset); + RunInterpreter(*this, state, dummy_debug_data, config.main_offset); } #else DebugData dummy_debug_data; - RunInterpreter(setup, state, dummy_debug_data, config.main_offset); + RunInterpreter(*this, state, dummy_debug_data, config.main_offset); #endif // ARCHITECTURE_x86_64 } DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, - const Regs::ShaderConfig& config, - const ShaderSetup& setup) { + const Regs::ShaderConfig& config) { UnitState state; DebugData debug_data; // Setup input register table boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); state.LoadInputVertex(input, num_attributes); - RunInterpreter(setup, state, debug_data, config.main_offset); + RunInterpreter(*this, state, debug_data, config.main_offset); return debug_data; } diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index c5d23e0ea..61becb6e5 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -198,11 +198,10 @@ struct ShaderSetup { * @param input Input vertex into the shader * @param num_attributes The number of vertex shader attributes * @param config Configuration object for the shader pipeline - * @param setup Setup object for the shader pipeline * @return Debug information for this shader with regards to the given vertex */ DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - const Regs::ShaderConfig& config, const ShaderSetup& setup); + const Regs::ShaderConfig& config); }; } // namespace Shader -- cgit v1.2.3 From 1e1f9398176e4f1ec608f31f22a576c749a0a723 Mon Sep 17 00:00:00 2001 
From: Yuri Kunde Schlesner Date: Fri, 16 Dec 2016 22:30:00 -0800 Subject: VideoCore/Shader: Use only entry_point as ShaderSetup param This removes all implicit dependency of ShaderState on global PICA state. --- src/video_core/command_processor.cpp | 4 ++-- src/video_core/shader/shader.cpp | 16 +++++++++------- src/video_core/shader/shader.h | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 36f72393b..fc224c6f2 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -150,7 +150,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input)); shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); - g_state.vs.Run(shader_unit); + g_state.vs.Run(shader_unit, regs.vs.main_offset); Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); @@ -285,7 +285,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); - g_state.vs.Run(shader_unit); + g_state.vs.Run(shader_unit, regs.vs.main_offset); // Retrieve vertex from register data output_vertex = shader_unit.output_registers.ToVertex(regs.vs); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 868be1360..936db0582 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -120,33 +120,35 @@ void ShaderSetup::Setup() { MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ShaderSetup::Run(UnitState& state) { - auto& config = g_state.regs.vs; +void ShaderSetup::Run(UnitState& state, unsigned int entry_point) { + ASSERT(entry_point < 1024); 
MICROPROFILE_SCOPE(GPU_Shader); #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { - jit_shader->Run(*this, state, config.main_offset); + jit_shader->Run(*this, state, entry_point); } else { DebugData<false> dummy_debug_data; - RunInterpreter(*this, state, dummy_debug_data, config.main_offset); + RunInterpreter(*this, state, dummy_debug_data, entry_point); } #else DebugData<false> dummy_debug_data; - RunInterpreter(*this, state, dummy_debug_data, config.main_offset); + RunInterpreter(*this, state, dummy_debug_data, entry_point); #endif // ARCHITECTURE_x86_64 } DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, - const Regs::ShaderConfig& config) { + unsigned int entry_point) { + ASSERT(entry_point < 1024); + UnitState state; DebugData<true> debug_data; // Setup input register table boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); state.LoadInputVertex(input, num_attributes); - RunInterpreter(*this, state, debug_data, config.main_offset); + RunInterpreter(*this, state, debug_data, entry_point); return debug_data; } diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 61becb6e5..d21f481ab 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -191,7 +191,7 @@ struct ShaderSetup { * Runs the currently setup shader * @param state Shader unit state, must be setup per shader and per shader unit */ - void Run(UnitState& state); + void Run(UnitState& state, unsigned int entry_point); /** * Produce debug information based on the given shader and input vertex @@ -201,7 +201,7 @@ struct ShaderSetup { * @return Debug information for this shader with regards to the given vertex */ DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, - const Regs::ShaderConfig& config); + unsigned int entry_point); }; } // namespace Shader -- cgit v1.2.3 From bd82cffd0bf5d61eab8c7c856bcc284b1a77e33b Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: 
Fri, 16 Dec 2016 22:32:35 -0800 Subject: VideoCore/Shader: Add constness to methods --- src/video_core/shader/shader.cpp | 4 ++-- src/video_core/shader/shader.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 936db0582..ae696533f 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -120,7 +120,7 @@ void ShaderSetup::Setup() { MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ShaderSetup::Run(UnitState& state, unsigned int entry_point) { +void ShaderSetup::Run(UnitState& state, unsigned int entry_point) const { ASSERT(entry_point < 1024); MICROPROFILE_SCOPE(GPU_Shader); @@ -139,7 +139,7 @@ void ShaderSetup::Run(UnitState& state, unsigned int entry_point) { } DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) { + unsigned int entry_point) const { ASSERT(entry_point < 1024); UnitState state; diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index d21f481ab..44b9861e9 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -191,7 +191,7 @@ struct ShaderSetup { * Runs the currently setup shader * @param state Shader unit state, must be setup per shader and per shader unit */ - void Run(UnitState& state, unsigned int entry_point); + void Run(UnitState& state, unsigned int entry_point) const; /** * Produce debug information based on the given shader and input vertex @@ -201,7 +201,7 @@ struct ShaderSetup { * @return Debug information for this shader with regards to the given vertex */ DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point); + unsigned int entry_point) const; }; } // namespace Shader -- cgit v1.2.3 From dd4a1672a77830a53de61cf0554b34e9e17a2905 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: 
Dec 2016 23:21:26 -0800 Subject: VideoCore/Shader: Split shader uniform state and shader engine Currently there's only a single dummy implementation, which will be split in a following commit. --- src/video_core/command_processor.cpp | 13 +++++---- src/video_core/shader/shader.cpp | 44 ++++++++++++++++++++++-------- src/video_core/shader/shader.h | 17 ++++++++---- src/video_core/shader/shader_interpreter.h | 1 + 4 files changed, 54 insertions(+), 21 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index fc224c6f2..694c9f169 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -142,15 +142,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { MICROPROFILE_SCOPE(GPU_Drawing); immediate_attribute_id = 0; - Shader::UnitState shader_unit; - g_state.vs.Setup(); + auto* shader_engine = Shader::GetEngine(); + shader_engine->SetupBatch(&g_state.vs); // Send to vertex shader if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast(&immediate_input)); + Shader::UnitState shader_unit; shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); - g_state.vs.Run(shader_unit, regs.vs.main_offset); + shader_engine->Run(shader_unit, regs.vs.main_offset); Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); @@ -244,8 +245,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); + auto* shader_engine = Shader::GetEngine(); Shader::UnitState shader_unit; - g_state.vs.Setup(); + + shader_engine->SetupBatch(&g_state.vs); for (unsigned int index = 0; index < regs.num_vertices; ++index) { // Indexed rendering doesn't use the start offset @@ -285,7 +288,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, 
(void*)&input); shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); - g_state.vs.Run(shader_unit, regs.vs.main_offset); + shader_engine->Run(shader_unit, regs.vs.main_offset); // Retrieve vertex from register data output_vertex = shader_unit.output_registers.ToVertex(regs.vs); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index ae696533f..d276a1221 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -87,6 +87,17 @@ void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { conditional_code[1] = false; } +class MergedShaderEngine : public ShaderEngine { +public: + void SetupBatch(const ShaderSetup* setup) override; + void Run(UnitState& state, unsigned int entry_point) const override; + DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const override; + +private: + const ShaderSetup* setup = nullptr; +}; + #ifdef ARCHITECTURE_x86_64 static std::unordered_map> shader_map; static const JitShader* jit_shader; @@ -98,13 +109,17 @@ void ClearCache() { #endif // ARCHITECTURE_x86_64 } -void ShaderSetup::Setup() { +void MergedShaderEngine::SetupBatch(const ShaderSetup* setup_) { + setup = setup_; + if (setup == nullptr) + return; + #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { - u64 cache_key = - Common::ComputeHash64(&program_code, sizeof(program_code)) ^ - Common::ComputeHash64(&swizzle_data, sizeof(swizzle_data)); + u64 code_hash = Common::ComputeHash64(&setup->program_code, sizeof(setup->program_code)); + u64 swizzle_hash = Common::ComputeHash64(&setup->swizzle_data, sizeof(setup->swizzle_data)); + u64 cache_key = code_hash ^ swizzle_hash; auto iter = shader_map.find(cache_key); if (iter != shader_map.end()) { jit_shader = iter->second.get(); @@ -120,26 +135,28 @@ void ShaderSetup::Setup() { MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ShaderSetup::Run(UnitState& 
state, unsigned int entry_point) const { +void MergedShaderEngine::Run(UnitState& state, unsigned int entry_point) const { + ASSERT(setup != nullptr); ASSERT(entry_point < 1024); MICROPROFILE_SCOPE(GPU_Shader); #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { - jit_shader->Run(*this, state, entry_point); + jit_shader->Run(*setup, state, entry_point); } else { DebugData dummy_debug_data; - RunInterpreter(*this, state, dummy_debug_data, entry_point); + RunInterpreter(*setup, state, dummy_debug_data, entry_point); } #else DebugData dummy_debug_data; - RunInterpreter(*this, state, dummy_debug_data, entry_point); + RunInterpreter(*setup, state, dummy_debug_data, entry_point); #endif // ARCHITECTURE_x86_64 } -DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const { +DebugData MergedShaderEngine::ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const { + ASSERT(setup != nullptr); ASSERT(entry_point < 1024); UnitState state; @@ -148,10 +165,15 @@ DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_ // Setup input register table boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); state.LoadInputVertex(input, num_attributes); - RunInterpreter(*this, state, debug_data, entry_point); + RunInterpreter(*setup, state, debug_data, entry_point); return debug_data; } +ShaderEngine* GetEngine() { + static MergedShaderEngine merged_engine; + return &merged_engine; +} + } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 44b9861e9..899fb2607 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -156,7 +156,6 @@ struct UnitState { void ClearCache(); struct ShaderSetup { - struct { // The float uniforms are accessed by the shader JIT using SSE instructions, and are // therefore required to be 16-byte aligned. 
@@ -180,18 +179,23 @@ struct ShaderSetup { std::array program_code; std::array swizzle_data; +}; + +class ShaderEngine { +public: + virtual ~ShaderEngine() = default; /** * Performs any shader unit setup that only needs to happen once per shader (as opposed to once * per vertex, which would happen within the `Run` function). */ - void Setup(); + virtual void SetupBatch(const ShaderSetup* setup) = 0; /** * Runs the currently setup shader * @param state Shader unit state, must be setup per shader and per shader unit */ - void Run(UnitState& state, unsigned int entry_point) const; + virtual void Run(UnitState& state, unsigned int entry_point) const = 0; /** * Produce debug information based on the given shader and input vertex @@ -200,10 +204,13 @@ struct ShaderSetup { * @param config Configuration object for the shader pipeline * @return Debug information for this shader with regards to the given vertex */ - DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const; + virtual DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const = 0; }; +// TODO(yuriks): Remove and make it non-global state somewhere +ShaderEngine* GetEngine(); + } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index d31dcd7a6..3237b50b3 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -8,6 +8,7 @@ namespace Pica { namespace Shader { +struct ShaderSetup; struct UnitState; template -- cgit v1.2.3 From 8eefc62833bc8c3052c23f4f0d01d8b60a01925c Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 00:06:23 -0800 Subject: VideoCore/Shader: Rename shader_jit_x64{ => _compiler}.{cpp,h} --- src/video_core/CMakeLists.txt | 4 +- src/video_core/shader/shader.cpp | 2 +- src/video_core/shader/shader_jit_x64.cpp | 888 ---------------------- 
src/video_core/shader/shader_jit_x64.h | 121 --- src/video_core/shader/shader_jit_x64_compiler.cpp | 888 ++++++++++++++++++++++ src/video_core/shader/shader_jit_x64_compiler.h | 121 +++ 6 files changed, 1012 insertions(+), 1012 deletions(-) delete mode 100644 src/video_core/shader/shader_jit_x64.cpp delete mode 100644 src/video_core/shader/shader_jit_x64.h create mode 100644 src/video_core/shader/shader_jit_x64_compiler.cpp create mode 100644 src/video_core/shader/shader_jit_x64_compiler.h (limited to 'src/video_core') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6ca319b59..36397cce9 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -50,10 +50,10 @@ set(HEADERS if(ARCHITECTURE_x86_64) set(SRCS ${SRCS} - shader/shader_jit_x64.cpp) + shader/shader_jit_x64_compiler.cpp) set(HEADERS ${HEADERS} - shader/shader_jit_x64.h) + shader/shader_jit_x64_compiler.h) endif() create_directory_groups(${SRCS} ${HEADERS}) diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index d276a1221..97c6519d6 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -17,7 +17,7 @@ #include "video_core/shader/shader.h" #include "video_core/shader/shader_interpreter.h" #ifdef ARCHITECTURE_x86_64 -#include "video_core/shader/shader_jit_x64.h" +#include "video_core/shader/shader_jit_x64_compiler.h" #endif // ARCHITECTURE_x86_64 #include "video_core/video_core.h" diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp deleted file mode 100644 index c588b778b..000000000 --- a/src/video_core/shader/shader_jit_x64.cpp +++ /dev/null @@ -1,888 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include -#include -#include -#include -#include -#include -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/vector_math.h" -#include "common/x64/cpu_detect.h" -#include "common/x64/xbyak_abi.h" -#include "common/x64/xbyak_util.h" -#include "video_core/pica_state.h" -#include "video_core/pica_types.h" -#include "video_core/shader/shader.h" -#include "video_core/shader/shader_jit_x64.h" - -using namespace Common::X64; -using namespace Xbyak::util; -using Xbyak::Label; -using Xbyak::Reg32; -using Xbyak::Reg64; -using Xbyak::Xmm; - -namespace Pica { - -namespace Shader { - -typedef void (JitShader::*JitFunction)(Instruction instr); - -const JitFunction instr_table[64] = { - &JitShader::Compile_ADD, // add - &JitShader::Compile_DP3, // dp3 - &JitShader::Compile_DP4, // dp4 - &JitShader::Compile_DPH, // dph - nullptr, // unknown - &JitShader::Compile_EX2, // ex2 - &JitShader::Compile_LG2, // lg2 - nullptr, // unknown - &JitShader::Compile_MUL, // mul - &JitShader::Compile_SGE, // sge - &JitShader::Compile_SLT, // slt - &JitShader::Compile_FLR, // flr - &JitShader::Compile_MAX, // max - &JitShader::Compile_MIN, // min - &JitShader::Compile_RCP, // rcp - &JitShader::Compile_RSQ, // rsq - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_MOVA, // mova - &JitShader::Compile_MOV, // mov - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_DPH, // dphi - nullptr, // unknown - &JitShader::Compile_SGE, // sgei - &JitShader::Compile_SLT, // slti - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_NOP, // nop - &JitShader::Compile_END, // end - nullptr, // break - &JitShader::Compile_CALL, // call - &JitShader::Compile_CALLC, // callc - &JitShader::Compile_CALLU, // callu - &JitShader::Compile_IF, // ifu - &JitShader::Compile_IF, // ifc - &JitShader::Compile_LOOP, // loop - nullptr, // emit - 
nullptr, // sete - &JitShader::Compile_JMP, // jmpc - &JitShader::Compile_JMP, // jmpu - &JitShader::Compile_CMP, // cmp - &JitShader::Compile_CMP, // cmp - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad -}; - -// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can -// be used as scratch registers within a compiler function. The other registers have designated -// purposes, as documented below: - -/// Pointer to the uniform memory -static const Reg64 SETUP = r9; -/// The two 32-bit VS address offset registers set by the MOVA instruction -static const Reg64 ADDROFFS_REG_0 = r10; -static const Reg64 ADDROFFS_REG_1 = r11; -/// VS loop count register (Multiplied by 16) -static const Reg32 LOOPCOUNT_REG = r12d; -/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) -static const Reg32 LOOPCOUNT = esi; -/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) -static const Reg32 LOOPINC = edi; -/// Result of the previous CMP instruction for the X-component comparison -static const Reg64 COND0 = r13; -/// Result of the previous CMP instruction for the Y-component comparison -static const Reg64 COND1 = r14; -/// Pointer to the UnitState instance for the current VS unit -static const Reg64 STATE = r15; -/// SIMD scratch register -static const Xmm SCRATCH = xmm0; -/// Loaded with the first swizzled source register, otherwise can be used as a scratch register -static const 
Xmm SRC1 = xmm1; -/// Loaded with the second swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC2 = xmm2; -/// Loaded with the third swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC3 = xmm3; -/// Additional scratch register -static const Xmm SCRATCH2 = xmm4; -/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one -static const Xmm ONE = xmm14; -/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR -static const Xmm NEGBIT = xmm15; - -// State registers that must not be modified by external functions calls -// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed -static const BitSet32 persistent_regs = BuildRegSet({ - // Pointers to register blocks - SETUP, STATE, - // Cached registers - ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, - // Constants - ONE, NEGBIT, -}); - -/// Raw constant for the source register selector that indicates no swizzling is performed -static const u8 NO_SRC_REG_SWIZZLE = 0x1b; -/// Raw constant for the destination register enable mask that indicates all components are enabled -static const u8 NO_DEST_REG_MASK = 0xf; - -/** - * Get the vertex shader instruction for a given offset in the current shader program - * @param offset Offset in the current shader program of the instruction - * @return Instruction at the specified offset - */ -static Instruction GetVertexShaderInstruction(size_t offset) { - return {g_state.vs.program_code[offset]}; -} - -static void LogCritical(const char* msg) { - LOG_CRITICAL(HW_GPU, "%s", msg); -} - -void JitShader::Compile_Assert(bool condition, const char* msg) { - if (!condition) { - mov(ABI_PARAM1, reinterpret_cast(msg)); - CallFarFunction(*this, LogCritical); - } -} - -/** - * Loads and swizzles a source register into the specified XMM register. 
- * @param instr VS instruction, used for determining how to load the source register - * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) - * @param src_reg SourceRegister object corresponding to the source register to load - * @param dest Destination XMM register to store the loaded, swizzled source register - */ -void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, - Xmm dest) { - Reg64 src_ptr; - size_t src_offset; - - if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { - src_ptr = SETUP; - src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); - } else { - src_ptr = STATE; - src_offset = UnitState::InputOffset(src_reg); - } - - int src_offset_disp = (int)src_offset; - ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); - - unsigned operand_desc_id; - - const bool is_inverted = - (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); - - unsigned address_register_index; - unsigned offset_src; - - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || - instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - operand_desc_id = instr.mad.operand_desc_id; - offset_src = is_inverted ? 3 : 2; - address_register_index = instr.mad.address_register_index; - } else { - operand_desc_id = instr.common.operand_desc_id; - offset_src = is_inverted ? 
2 : 1; - address_register_index = instr.common.address_register_index; - } - - if (src_num == offset_src && address_register_index != 0) { - switch (address_register_index) { - case 1: // address offset 1 - movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); - break; - case 2: // address offset 2 - movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); - break; - case 3: // address offset 3 - movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); - break; - default: - UNREACHABLE(); - break; - } - } else { - // Load the source - movaps(dest, xword[src_ptr + src_offset_disp]); - } - - SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; - - // Generate instructions for source register swizzling as needed - u8 sel = swiz.GetRawSelector(src_num); - if (sel != NO_SRC_REG_SWIZZLE) { - // Selector component order needs to be reversed for the SHUFPS instruction - sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); - - // Shuffle inputs for swizzle - shufps(dest, dest, sel); - } - - // If the source register should be negated, flip the negative bit using XOR - const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; - if (negate[src_num - 1]) { - xorps(dest, NEGBIT); - } -} - -void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { - DestRegister dest; - unsigned operand_desc_id; - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || - instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - operand_desc_id = instr.mad.operand_desc_id; - dest = instr.mad.dest.Value(); - } else { - operand_desc_id = instr.common.operand_desc_id; - dest = instr.common.dest.Value(); - } - - SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; - - size_t dest_offset_disp = UnitState::OutputOffset(dest); - - // If all components are enabled, write the result to the destination register - if (swiz.dest_mask == NO_DEST_REG_MASK) { - // Store dest 
back to memory - movaps(xword[STATE + dest_offset_disp], src); - - } else { - // Not all components are enabled, so mask the result when storing to the destination - // register... - movaps(SCRATCH, xword[STATE + dest_offset_disp]); - - if (Common::GetCPUCaps().sse4_1) { - u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | - ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); - blendps(SCRATCH, src, mask); - } else { - movaps(SCRATCH2, src); - unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination - unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination - - // Compute selector to selectively copy source components to destination for SHUFPS - // instruction - u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | - ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | - ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | - ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); - shufps(SCRATCH, SCRATCH2, sel); - } - - // Store dest back to memory - movaps(xword[STATE + dest_offset_disp], SCRATCH); - } -} - -void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { - movaps(scratch, src1); - cmpordps(scratch, src2); - - mulps(src1, src2); - - movaps(src2, src1); - cmpunordps(src2, src2); - - xorps(scratch, src2); - andps(src1, scratch); -} - -void JitShader::Compile_EvaluateCondition(Instruction instr) { - // Note: NXOR is used below to check for equality - switch (instr.flow_control.op) { - case Instruction::FlowControlType::Or: - mov(eax, COND0); - mov(ebx, COND1); - xor(eax, (instr.flow_control.refx.Value() ^ 1)); - xor(ebx, (instr.flow_control.refy.Value() ^ 1)); - or (eax, ebx); - break; - - case Instruction::FlowControlType::And: - mov(eax, COND0); - mov(ebx, COND1); - xor(eax, (instr.flow_control.refx.Value() ^ 1)); - xor(ebx, (instr.flow_control.refy.Value() ^ 1)); - and(eax, ebx); - break; - - case Instruction::FlowControlType::JustX: - mov(eax, COND0); - xor(eax, 
(instr.flow_control.refx.Value() ^ 1)); - break; - - case Instruction::FlowControlType::JustY: - mov(eax, COND1); - xor(eax, (instr.flow_control.refy.Value() ^ 1)); - break; - } -} - -void JitShader::Compile_UniformCondition(Instruction instr) { - size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); - cmp(byte[SETUP + offset], 0); -} - -BitSet32 JitShader::PersistentCallerSavedRegs() { - return persistent_regs & ABI_ALL_CALLER_SAVED; -} - -void JitShader::Compile_ADD(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - addps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DP3(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); - - movaps(SRC3, SRC1); - shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); - - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); - addps(SRC1, SRC2); - addps(SRC1, SRC3); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DP4(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - addps(SRC1, SRC2); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - addps(SRC1, SRC2); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DPH(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, 
instr.common.src2, SRC2); - } - - if (Common::GetCPUCaps().sse4_1) { - // Set 4th component to 1.0 - blendps(SRC1, ONE, 0b1000); - } else { - // Set 4th component to 1.0 - movaps(SCRATCH, SRC1); - unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ - unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 - } - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - addps(SRC1, SRC2); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - addps(SRC1, SRC2); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_EX2(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, exp2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_LG2(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, log2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MUL(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_SGE(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - 
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - cmpleps(SRC2, SRC1); - andps(SRC2, ONE); - - Compile_DestEnable(instr, SRC2); -} - -void JitShader::Compile_SLT(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - cmpltps(SRC1, SRC2); - andps(SRC1, ONE); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_FLR(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - if (Common::GetCPUCaps().sse4_1) { - roundps(SRC1, SRC1, _MM_FROUND_FLOOR); - } else { - cvttps2dq(SRC1, SRC1); - cvtdq2ps(SRC1, SRC1); - } - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MAX(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. - maxps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MIN(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. 
- minps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]}; - - if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { - return; // NoOp - } - - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // Convert floats to integers using truncation (only care about X and Y components) - cvttps2dq(SRC1, SRC1); - - // Get result - movq(rax, SRC1); - - // Handle destination enable - if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { - // Move and sign-extend low 32 bits - movsxd(ADDROFFS_REG_0, eax); - - // Move and sign-extend high 32 bits - shr(rax, 32); - movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - shl(ADDROFFS_REG_1, 4); - } else { - if (swiz.DestComponentEnabled(0)) { - // Move and sign-extend low 32 bits - movsxd(ADDROFFS_REG_0, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - } else if (swiz.DestComponentEnabled(1)) { - // Move and sign-extend high 32 bits - shr(rax, 32); - movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_1, 4); - } - } -} - -void JitShader::Compile_MOV(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_RCP(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica - // performs this operation more accurately. This should be checked on hardware. 
- rcpss(SRC1, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_RSQ(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica - // performs this operation more accurately. This should be checked on hardware. - rsqrtss(SRC1, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_NOP(Instruction instr) {} - -void JitShader::Compile_END(Instruction instr) { - ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); - ret(); -} - -void JitShader::Compile_CALL(Instruction instr) { - // Push offset of the return - push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); - - // Call the subroutine - call(instruction_labels[instr.flow_control.dest_offset]); - - // Skip over the return offset that's on the stack - add(rsp, 8); -} - -void JitShader::Compile_CALLC(Instruction instr) { - Compile_EvaluateCondition(instr); - Label b; - jz(b); - Compile_CALL(instr); - L(b); -} - -void JitShader::Compile_CALLU(Instruction instr) { - Compile_UniformCondition(instr); - Label b; - jz(b); - Compile_CALL(instr); - L(b); -} - -void JitShader::Compile_CMP(Instruction instr) { - using Op = Instruction::Common::CompareOpType::Op; - Op op_x = instr.common.compare_op.x; - Op op_y = instr.common.compare_op.y; - - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to - // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here - // because they don't match when used with NaNs. 
- static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; - - bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); - Xmm lhs_x = invert_op_x ? SRC2 : SRC1; - Xmm rhs_x = invert_op_x ? SRC1 : SRC2; - - if (op_x == op_y) { - // Compare X-component and Y-component together - cmpps(lhs_x, rhs_x, cmp[op_x]); - movq(COND0, lhs_x); - - mov(COND1, COND0); - } else { - bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); - Xmm lhs_y = invert_op_y ? SRC2 : SRC1; - Xmm rhs_y = invert_op_y ? SRC1 : SRC2; - - // Compare X-component - movaps(SCRATCH, lhs_x); - cmpss(SCRATCH, rhs_x, cmp[op_x]); - - // Compare Y-component - cmpps(lhs_y, rhs_y, cmp[op_y]); - - movq(COND0, SCRATCH); - movq(COND1, lhs_y); - } - - shr(COND0.cvt32(), 31); // ignores upper 32 bits in source - shr(COND1, 63); -} - -void JitShader::Compile_MAD(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); - - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); - Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); - } else { - Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); - Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); - } - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - addps(SRC1, SRC3); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_IF(Instruction instr) { - Compile_Assert(instr.flow_control.dest_offset >= program_counter, - "Backwards if-statements not supported"); - Label l_else, l_endif; - - // Evaluate the "IF" condition - if (instr.opcode.Value() == OpCode::Id::IFU) { - Compile_UniformCondition(instr); - } else if (instr.opcode.Value() == OpCode::Id::IFC) { - Compile_EvaluateCondition(instr); - } - jz(l_else, T_NEAR); - - // Compile the code that corresponds to the condition evaluating as true - Compile_Block(instr.flow_control.dest_offset); - - // If there isn't an "ELSE" condition, we are done here - if 
(instr.flow_control.num_instructions == 0) { - L(l_else); - return; - } - - jmp(l_endif, T_NEAR); - - L(l_else); - // This code corresponds to the "ELSE" condition - // Comple the code that corresponds to the condition evaluating as false - Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); - - L(l_endif); -} - -void JitShader::Compile_LOOP(Instruction instr) { - Compile_Assert(instr.flow_control.dest_offset >= program_counter, - "Backwards loops not supported"); - Compile_Assert(!looping, "Nested loops not supported"); - - looping = true; - - // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. - // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by - // 4 bits) to be used as an offset into the 16-byte vector registers later - size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); - mov(LOOPCOUNT, dword[SETUP + offset]); - mov(LOOPCOUNT_REG, LOOPCOUNT); - shr(LOOPCOUNT_REG, 4); - and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start - mov(LOOPINC, LOOPCOUNT); - shr(LOOPINC, 12); - and(LOOPINC, 0xFF0); // Z-component is the incrementer - movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count - add(LOOPCOUNT, 1); // Iteration count is X-component + 1 - - Label l_loop_start; - L(l_loop_start); - - Compile_Block(instr.flow_control.dest_offset + 1); - - add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component - sub(LOOPCOUNT, 1); // Increment loop count by 1 - jnz(l_loop_start); // Loop if not equal - - looping = false; -} - -void JitShader::Compile_JMP(Instruction instr) { - if (instr.opcode.Value() == OpCode::Id::JMPC) - Compile_EvaluateCondition(instr); - else if (instr.opcode.Value() == OpCode::Id::JMPU) - Compile_UniformCondition(instr); - else - UNREACHABLE(); - - bool inverted_condition = - (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); - - Label& 
b = instruction_labels[instr.flow_control.dest_offset]; - if (inverted_condition) { - jz(b, T_NEAR); - } else { - jnz(b, T_NEAR); - } -} - -void JitShader::Compile_Block(unsigned end) { - while (program_counter < end) { - Compile_NextInstr(); - } -} - -void JitShader::Compile_Return() { - // Peek return offset on the stack and check if we're at that offset - mov(rax, qword[rsp + 8]); - cmp(eax, (program_counter)); - - // If so, jump back to before CALL - Label b; - jnz(b); - ret(); - L(b); -} - -void JitShader::Compile_NextInstr() { - if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { - Compile_Return(); - } - - L(instruction_labels[program_counter]); - - Instruction instr = GetVertexShaderInstruction(program_counter++); - - OpCode::Id opcode = instr.opcode.Value(); - auto instr_func = instr_table[static_cast(opcode)]; - - if (instr_func) { - // JIT the instruction! - ((*this).*instr_func)(instr); - } else { - // Unhandled instruction - LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", - instr.opcode.Value().EffectiveOpCode(), instr.hex); - } -} - -void JitShader::FindReturnOffsets() { - return_offsets.clear(); - - for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { - Instruction instr = GetVertexShaderInstruction(offset); - - switch (instr.opcode.Value()) { - case OpCode::Id::CALL: - case OpCode::Id::CALLC: - case OpCode::Id::CALLU: - return_offsets.push_back(instr.flow_control.dest_offset + - instr.flow_control.num_instructions); - break; - default: - break; - } - } - - // Sort for efficient binary search later - std::sort(return_offsets.begin(), return_offsets.end()); -} - -void JitShader::Compile() { - // Reset flow control state - program = (CompiledShader*)getCurr(); - program_counter = 0; - looping = false; - instruction_labels.fill(Xbyak::Label()); - - // Find all `CALL` instructions and identify return locations - FindReturnOffsets(); - - // The stack pointer is 8 modulo 16 at the 
entry of a procedure - ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); - - mov(SETUP, ABI_PARAM1); - mov(STATE, ABI_PARAM2); - - // Zero address/loop registers - xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); - xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); - xor(LOOPCOUNT_REG, LOOPCOUNT_REG); - - // Used to set a register to one - static const __m128 one = {1.f, 1.f, 1.f, 1.f}; - mov(rax, reinterpret_cast(&one)); - movaps(ONE, xword[rax]); - - // Used to negate registers - static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; - mov(rax, reinterpret_cast(&neg)); - movaps(NEGBIT, xword[rax]); - - // Jump to start of the shader program - jmp(ABI_PARAM3); - - // Compile entire program - Compile_Block(static_cast(g_state.vs.program_code.size())); - - // Free memory that's no longer needed - return_offsets.clear(); - return_offsets.shrink_to_fit(); - - ready(); - - uintptr_t size = reinterpret_cast(getCurr()) - reinterpret_cast(program); - ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); - LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); -} - -JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} - -} // namespace Shader - -} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h deleted file mode 100644 index f37548306..000000000 --- a/src/video_core/shader/shader_jit_x64.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include "common/bit_set.h" -#include "common/common_types.h" -#include "common/x64/emitter.h" -#include "video_core/shader/shader.h" - -using nihstro::Instruction; -using nihstro::OpCode; -using nihstro::SwizzlePattern; - -namespace Pica { - -namespace Shader { - -/// Memory allocated for each compiled shader (64Kb) -constexpr size_t MAX_SHADER_SIZE = 1024 * 64; - -/** - * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 - * code that can be executed on the host machine directly. - */ -class JitShader : public Xbyak::CodeGenerator { -public: - JitShader(); - - void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { - program(&setup, &state, instruction_labels[offset].getAddress()); - } - - void Compile(); - - void Compile_ADD(Instruction instr); - void Compile_DP3(Instruction instr); - void Compile_DP4(Instruction instr); - void Compile_DPH(Instruction instr); - void Compile_EX2(Instruction instr); - void Compile_LG2(Instruction instr); - void Compile_MUL(Instruction instr); - void Compile_SGE(Instruction instr); - void Compile_SLT(Instruction instr); - void Compile_FLR(Instruction instr); - void Compile_MAX(Instruction instr); - void Compile_MIN(Instruction instr); - void Compile_RCP(Instruction instr); - void Compile_RSQ(Instruction instr); - void Compile_MOVA(Instruction instr); - void Compile_MOV(Instruction instr); - void Compile_NOP(Instruction instr); - void Compile_END(Instruction instr); - void Compile_CALL(Instruction instr); - void Compile_CALLC(Instruction instr); - void Compile_CALLU(Instruction instr); - void Compile_IF(Instruction instr); - void Compile_LOOP(Instruction instr); - void Compile_JMP(Instruction instr); - void Compile_CMP(Instruction instr); - void Compile_MAD(Instruction instr); - -private: - void Compile_Block(unsigned end); - void Compile_NextInstr(); - - void 
Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, - Xbyak::Xmm dest); - void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); - - /** - * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying - * zero by inf. Clobbers `src2` and `scratch`. - */ - void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); - - void Compile_EvaluateCondition(Instruction instr); - void Compile_UniformCondition(Instruction instr); - - /** - * Emits the code to conditionally return from a subroutine envoked by the `CALL` instruction. - */ - void Compile_Return(); - - BitSet32 PersistentCallerSavedRegs(); - - /** - * Assertion evaluated at compile-time, but only triggered if executed at runtime. - * @param msg Message to be logged if the assertion fails. - */ - void Compile_Assert(bool condition, const char* msg); - - /** - * Analyzes the entire shader program for `CALL` instructions before emitting any code, - * identifying the locations where a return needs to be inserted. - */ - void FindReturnOffsets(); - - /// Mapping of Pica VS instructions to pointers in the emitted code - std::array instruction_labels; - - /// Offsets in code where a return needs to be inserted - std::vector return_offsets; - - unsigned program_counter = 0; ///< Offset of the next instruction to decode - bool looping = false; ///< True if compiling a loop, used to check for nested loops - - using CompiledShader = void(const void* setup, void* state, const u8* start_addr); - CompiledShader* program = nullptr; -}; - -} // Shader - -} // Pica diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp new file mode 100644 index 000000000..880543306 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -0,0 +1,888 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include +#include +#include +#include +#include +#include +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/vector_math.h" +#include "common/x64/cpu_detect.h" +#include "common/x64/xbyak_abi.h" +#include "common/x64/xbyak_util.h" +#include "video_core/pica_state.h" +#include "video_core/pica_types.h" +#include "video_core/shader/shader.h" +#include "video_core/shader/shader_jit_x64_compiler.h" + +using namespace Common::X64; +using namespace Xbyak::util; +using Xbyak::Label; +using Xbyak::Reg32; +using Xbyak::Reg64; +using Xbyak::Xmm; + +namespace Pica { + +namespace Shader { + +typedef void (JitShader::*JitFunction)(Instruction instr); + +const JitFunction instr_table[64] = { + &JitShader::Compile_ADD, // add + &JitShader::Compile_DP3, // dp3 + &JitShader::Compile_DP4, // dp4 + &JitShader::Compile_DPH, // dph + nullptr, // unknown + &JitShader::Compile_EX2, // ex2 + &JitShader::Compile_LG2, // lg2 + nullptr, // unknown + &JitShader::Compile_MUL, // mul + &JitShader::Compile_SGE, // sge + &JitShader::Compile_SLT, // slt + &JitShader::Compile_FLR, // flr + &JitShader::Compile_MAX, // max + &JitShader::Compile_MIN, // min + &JitShader::Compile_RCP, // rcp + &JitShader::Compile_RSQ, // rsq + nullptr, // unknown + nullptr, // unknown + &JitShader::Compile_MOVA, // mova + &JitShader::Compile_MOV, // mov + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitShader::Compile_DPH, // dphi + nullptr, // unknown + &JitShader::Compile_SGE, // sgei + &JitShader::Compile_SLT, // slti + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitShader::Compile_NOP, // nop + &JitShader::Compile_END, // end + nullptr, // break + &JitShader::Compile_CALL, // call + &JitShader::Compile_CALLC, // callc + &JitShader::Compile_CALLU, // callu + &JitShader::Compile_IF, // ifu + &JitShader::Compile_IF, // ifc + &JitShader::Compile_LOOP, // loop + nullptr, // emit 
+ nullptr, // sete + &JitShader::Compile_JMP, // jmpc + &JitShader::Compile_JMP, // jmpu + &JitShader::Compile_CMP, // cmp + &JitShader::Compile_CMP, // cmp + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad +}; + +// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can +// be used as scratch registers within a compiler function. The other registers have designated +// purposes, as documented below: + +/// Pointer to the uniform memory +static const Reg64 SETUP = r9; +/// The two 32-bit VS address offset registers set by the MOVA instruction +static const Reg64 ADDROFFS_REG_0 = r10; +static const Reg64 ADDROFFS_REG_1 = r11; +/// VS loop count register (Multiplied by 16) +static const Reg32 LOOPCOUNT_REG = r12d; +/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) +static const Reg32 LOOPCOUNT = esi; +/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) +static const Reg32 LOOPINC = edi; +/// Result of the previous CMP instruction for the X-component comparison +static const Reg64 COND0 = r13; +/// Result of the previous CMP instruction for the Y-component comparison +static const Reg64 COND1 = r14; +/// Pointer to the UnitState instance for the current VS unit +static const Reg64 STATE = r15; +/// SIMD scratch register +static const Xmm SCRATCH = xmm0; +/// Loaded with the first swizzled source register, otherwise can be used as a scratch register +static 
const Xmm SRC1 = xmm1; +/// Loaded with the second swizzled source register, otherwise can be used as a scratch register +static const Xmm SRC2 = xmm2; +/// Loaded with the third swizzled source register, otherwise can be used as a scratch register +static const Xmm SRC3 = xmm3; +/// Additional scratch register +static const Xmm SCRATCH2 = xmm4; +/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one +static const Xmm ONE = xmm14; +/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR +static const Xmm NEGBIT = xmm15; + +// State registers that must not be modified by external functions calls +// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed +static const BitSet32 persistent_regs = BuildRegSet({ + // Pointers to register blocks + SETUP, STATE, + // Cached registers + ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, + // Constants + ONE, NEGBIT, +}); + +/// Raw constant for the source register selector that indicates no swizzling is performed +static const u8 NO_SRC_REG_SWIZZLE = 0x1b; +/// Raw constant for the destination register enable mask that indicates all components are enabled +static const u8 NO_DEST_REG_MASK = 0xf; + +/** + * Get the vertex shader instruction for a given offset in the current shader program + * @param offset Offset in the current shader program of the instruction + * @return Instruction at the specified offset + */ +static Instruction GetVertexShaderInstruction(size_t offset) { + return {g_state.vs.program_code[offset]}; +} + +static void LogCritical(const char* msg) { + LOG_CRITICAL(HW_GPU, "%s", msg); +} + +void JitShader::Compile_Assert(bool condition, const char* msg) { + if (!condition) { + mov(ABI_PARAM1, reinterpret_cast(msg)); + CallFarFunction(*this, LogCritical); + } +} + +/** + * Loads and swizzles a source register into the specified XMM register. 
+ * @param instr VS instruction, used for determining how to load the source register + * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) + * @param src_reg SourceRegister object corresponding to the source register to load + * @param dest Destination XMM register to store the loaded, swizzled source register + */ +void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, + Xmm dest) { + Reg64 src_ptr; + size_t src_offset; + + if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { + src_ptr = SETUP; + src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); + } else { + src_ptr = STATE; + src_offset = UnitState::InputOffset(src_reg); + } + + int src_offset_disp = (int)src_offset; + ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); + + unsigned operand_desc_id; + + const bool is_inverted = + (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); + + unsigned address_register_index; + unsigned offset_src; + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + offset_src = is_inverted ? 3 : 2; + address_register_index = instr.mad.address_register_index; + } else { + operand_desc_id = instr.common.operand_desc_id; + offset_src = is_inverted ? 
2 : 1; + address_register_index = instr.common.address_register_index; + } + + if (src_num == offset_src && address_register_index != 0) { + switch (address_register_index) { + case 1: // address offset 1 + movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); + break; + case 2: // address offset 2 + movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); + break; + case 3: // address offset 3 + movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); + break; + default: + UNREACHABLE(); + break; + } + } else { + // Load the source + movaps(dest, xword[src_ptr + src_offset_disp]); + } + + SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; + + // Generate instructions for source register swizzling as needed + u8 sel = swiz.GetRawSelector(src_num); + if (sel != NO_SRC_REG_SWIZZLE) { + // Selector component order needs to be reversed for the SHUFPS instruction + sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); + + // Shuffle inputs for swizzle + shufps(dest, dest, sel); + } + + // If the source register should be negated, flip the negative bit using XOR + const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; + if (negate[src_num - 1]) { + xorps(dest, NEGBIT); + } +} + +void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { + DestRegister dest; + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + dest = instr.mad.dest.Value(); + } else { + operand_desc_id = instr.common.operand_desc_id; + dest = instr.common.dest.Value(); + } + + SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; + + size_t dest_offset_disp = UnitState::OutputOffset(dest); + + // If all components are enabled, write the result to the destination register + if (swiz.dest_mask == NO_DEST_REG_MASK) { + // Store dest 
back to memory + movaps(xword[STATE + dest_offset_disp], src); + + } else { + // Not all components are enabled, so mask the result when storing to the destination + // register... + movaps(SCRATCH, xword[STATE + dest_offset_disp]); + + if (Common::GetCPUCaps().sse4_1) { + u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | + ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); + blendps(SCRATCH, src, mask); + } else { + movaps(SCRATCH2, src); + unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination + unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination + + // Compute selector to selectively copy source components to destination for SHUFPS + // instruction + u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | + ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | + ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | + ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); + shufps(SCRATCH, SCRATCH2, sel); + } + + // Store dest back to memory + movaps(xword[STATE + dest_offset_disp], SCRATCH); + } +} + +void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { + movaps(scratch, src1); + cmpordps(scratch, src2); + + mulps(src1, src2); + + movaps(src2, src1); + cmpunordps(src2, src2); + + xorps(scratch, src2); + andps(src1, scratch); +} + +void JitShader::Compile_EvaluateCondition(Instruction instr) { + // Note: NXOR is used below to check for equality + switch (instr.flow_control.op) { + case Instruction::FlowControlType::Or: + mov(eax, COND0); + mov(ebx, COND1); + xor(eax, (instr.flow_control.refx.Value() ^ 1)); + xor(ebx, (instr.flow_control.refy.Value() ^ 1)); + or (eax, ebx); + break; + + case Instruction::FlowControlType::And: + mov(eax, COND0); + mov(ebx, COND1); + xor(eax, (instr.flow_control.refx.Value() ^ 1)); + xor(ebx, (instr.flow_control.refy.Value() ^ 1)); + and(eax, ebx); + break; + + case Instruction::FlowControlType::JustX: + mov(eax, COND0); + xor(eax, 
(instr.flow_control.refx.Value() ^ 1)); + break; + + case Instruction::FlowControlType::JustY: + mov(eax, COND1); + xor(eax, (instr.flow_control.refy.Value() ^ 1)); + break; + } +} + +void JitShader::Compile_UniformCondition(Instruction instr) { + size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); + cmp(byte[SETUP + offset], 0); +} + +BitSet32 JitShader::PersistentCallerSavedRegs() { + return persistent_regs & ABI_ALL_CALLER_SAVED; +} + +void JitShader::Compile_ADD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + addps(SRC1, SRC2); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_DP3(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + movaps(SRC2, SRC1); + shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); + + movaps(SRC3, SRC1); + shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); + + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); + addps(SRC1, SRC2); + addps(SRC1, SRC3); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_DP4(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + addps(SRC1, SRC2); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + addps(SRC1, SRC2); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_DPH(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, 
instr.common.src2, SRC2); + } + + if (Common::GetCPUCaps().sse4_1) { + // Set 4th component to 1.0 + blendps(SRC1, ONE, 0b1000); + } else { + // Set 4th component to 1.0 + movaps(SCRATCH, SRC1); + unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ + unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 + } + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + addps(SRC1, SRC2); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + addps(SRC1, SRC2); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_EX2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + movss(xmm0, SRC1); // ABI_PARAM1 + + ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + CallFarFunction(*this, exp2f); + ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN + movaps(SRC1, xmm0); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_LG2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + movss(xmm0, SRC1); // ABI_PARAM1 + + ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + CallFarFunction(*this, log2f); + ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN + movaps(SRC1, xmm0); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MUL(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_SGE(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + 
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + cmpleps(SRC2, SRC1); + andps(SRC2, ONE); + + Compile_DestEnable(instr, SRC2); +} + +void JitShader::Compile_SLT(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + cmpltps(SRC1, SRC2); + andps(SRC1, ONE); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_FLR(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + if (Common::GetCPUCaps().sse4_1) { + roundps(SRC1, SRC1, _MM_FROUND_FLOOR); + } else { + cvttps2dq(SRC1, SRC1); + cvtdq2ps(SRC1, SRC1); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MAX(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. + maxps(SRC1, SRC2); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MIN(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. 
+ minps(SRC1, SRC2); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MOVA(Instruction instr) { + SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]}; + + if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { + return; // NoOp + } + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // Convert floats to integers using truncation (only care about X and Y components) + cvttps2dq(SRC1, SRC1); + + // Get result + movq(rax, SRC1); + + // Handle destination enable + if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { + // Move and sign-extend low 32 bits + movsxd(ADDROFFS_REG_0, eax); + + // Move and sign-extend high 32 bits + shr(rax, 32); + movsxd(ADDROFFS_REG_1, eax); + + // Multiply by 16 to be used as an offset later + shl(ADDROFFS_REG_0, 4); + shl(ADDROFFS_REG_1, 4); + } else { + if (swiz.DestComponentEnabled(0)) { + // Move and sign-extend low 32 bits + movsxd(ADDROFFS_REG_0, eax); + + // Multiply by 16 to be used as an offset later + shl(ADDROFFS_REG_0, 4); + } else if (swiz.DestComponentEnabled(1)) { + // Move and sign-extend high 32 bits + shr(rax, 32); + movsxd(ADDROFFS_REG_1, eax); + + // Multiply by 16 to be used as an offset later + shl(ADDROFFS_REG_1, 4); + } + } +} + +void JitShader::Compile_MOV(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_RCP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. 
+ rcpss(SRC1, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_RSQ(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + rsqrtss(SRC1, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_NOP(Instruction instr) {} + +void JitShader::Compile_END(Instruction instr) { + ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); + ret(); +} + +void JitShader::Compile_CALL(Instruction instr) { + // Push offset of the return + push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); + + // Call the subroutine + call(instruction_labels[instr.flow_control.dest_offset]); + + // Skip over the return offset that's on the stack + add(rsp, 8); +} + +void JitShader::Compile_CALLC(Instruction instr) { + Compile_EvaluateCondition(instr); + Label b; + jz(b); + Compile_CALL(instr); + L(b); +} + +void JitShader::Compile_CALLU(Instruction instr) { + Compile_UniformCondition(instr); + Label b; + jz(b); + Compile_CALL(instr); + L(b); +} + +void JitShader::Compile_CMP(Instruction instr) { + using Op = Instruction::Common::CompareOpType::Op; + Op op_x = instr.common.compare_op.x; + Op op_y = instr.common.compare_op.y; + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to + // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here + // because they don't match when used with NaNs.
+ static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; + + bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); + Xmm lhs_x = invert_op_x ? SRC2 : SRC1; + Xmm rhs_x = invert_op_x ? SRC1 : SRC2; + + if (op_x == op_y) { + // Compare X-component and Y-component together + cmpps(lhs_x, rhs_x, cmp[op_x]); + movq(COND0, lhs_x); + + mov(COND1, COND0); + } else { + bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); + Xmm lhs_y = invert_op_y ? SRC2 : SRC1; + Xmm rhs_y = invert_op_y ? SRC1 : SRC2; + + // Compare X-component + movaps(SCRATCH, lhs_x); + cmpss(SCRATCH, rhs_x, cmp[op_x]); + + // Compare Y-component + cmpps(lhs_y, rhs_y, cmp[op_y]); + + movq(COND0, SCRATCH); + movq(COND1, lhs_y); + } + + shr(COND0.cvt32(), 31); // ignores upper 32 bits in source + shr(COND1, 63); +} + +void JitShader::Compile_MAD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); + } else { + Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); + } + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + addps(SRC1, SRC3); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_IF(Instruction instr) { + Compile_Assert(instr.flow_control.dest_offset >= program_counter, + "Backwards if-statements not supported"); + Label l_else, l_endif; + + // Evaluate the "IF" condition + if (instr.opcode.Value() == OpCode::Id::IFU) { + Compile_UniformCondition(instr); + } else if (instr.opcode.Value() == OpCode::Id::IFC) { + Compile_EvaluateCondition(instr); + } + jz(l_else, T_NEAR); + + // Compile the code that corresponds to the condition evaluating as true + Compile_Block(instr.flow_control.dest_offset); + + // If there isn't an "ELSE" condition, we are done here + if 
(instr.flow_control.num_instructions == 0) { + L(l_else); + return; + } + + jmp(l_endif, T_NEAR); + + L(l_else); + // This code corresponds to the "ELSE" condition + // Compile the code that corresponds to the condition evaluating as false + Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); + + L(l_endif); +} + +void JitShader::Compile_LOOP(Instruction instr) { + Compile_Assert(instr.flow_control.dest_offset >= program_counter, + "Backwards loops not supported"); + Compile_Assert(!looping, "Nested loops not supported"); + + looping = true; + + // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. + // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by + // 4 bits) to be used as an offset into the 16-byte vector registers later + size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); + mov(LOOPCOUNT, dword[SETUP + offset]); + mov(LOOPCOUNT_REG, LOOPCOUNT); + shr(LOOPCOUNT_REG, 4); + and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start + mov(LOOPINC, LOOPCOUNT); + shr(LOOPINC, 12); + and(LOOPINC, 0xFF0); // Z-component is the incrementer + movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count + add(LOOPCOUNT, 1); // Iteration count is X-component + 1 + + Label l_loop_start; + L(l_loop_start); + + Compile_Block(instr.flow_control.dest_offset + 1); + + add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component + sub(LOOPCOUNT, 1); // Decrement loop count by 1 + jnz(l_loop_start); // Loop if not equal + + looping = false; +} + +void JitShader::Compile_JMP(Instruction instr) { + if (instr.opcode.Value() == OpCode::Id::JMPC) + Compile_EvaluateCondition(instr); + else if (instr.opcode.Value() == OpCode::Id::JMPU) + Compile_UniformCondition(instr); + else + UNREACHABLE(); + + bool inverted_condition = + (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); + + Label&
b = instruction_labels[instr.flow_control.dest_offset]; + if (inverted_condition) { + jz(b, T_NEAR); + } else { + jnz(b, T_NEAR); + } +} + +void JitShader::Compile_Block(unsigned end) { + while (program_counter < end) { + Compile_NextInstr(); + } +} + +void JitShader::Compile_Return() { + // Peek return offset on the stack and check if we're at that offset + mov(rax, qword[rsp + 8]); + cmp(eax, (program_counter)); + + // If so, jump back to before CALL + Label b; + jnz(b); + ret(); + L(b); +} + +void JitShader::Compile_NextInstr() { + if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { + Compile_Return(); + } + + L(instruction_labels[program_counter]); + + Instruction instr = GetVertexShaderInstruction(program_counter++); + + OpCode::Id opcode = instr.opcode.Value(); + auto instr_func = instr_table[static_cast(opcode)]; + + if (instr_func) { + // JIT the instruction! + ((*this).*instr_func)(instr); + } else { + // Unhandled instruction + LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", + instr.opcode.Value().EffectiveOpCode(), instr.hex); + } +} + +void JitShader::FindReturnOffsets() { + return_offsets.clear(); + + for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { + Instruction instr = GetVertexShaderInstruction(offset); + + switch (instr.opcode.Value()) { + case OpCode::Id::CALL: + case OpCode::Id::CALLC: + case OpCode::Id::CALLU: + return_offsets.push_back(instr.flow_control.dest_offset + + instr.flow_control.num_instructions); + break; + default: + break; + } + } + + // Sort for efficient binary search later + std::sort(return_offsets.begin(), return_offsets.end()); +} + +void JitShader::Compile() { + // Reset flow control state + program = (CompiledShader*)getCurr(); + program_counter = 0; + looping = false; + instruction_labels.fill(Xbyak::Label()); + + // Find all `CALL` instructions and identify return locations + FindReturnOffsets(); + + // The stack pointer is 8 modulo 16 at the 
entry of a procedure + ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); + + mov(SETUP, ABI_PARAM1); + mov(STATE, ABI_PARAM2); + + // Zero address/loop registers + xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); + xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); + xor(LOOPCOUNT_REG, LOOPCOUNT_REG); + + // Used to set a register to one + static const __m128 one = {1.f, 1.f, 1.f, 1.f}; + mov(rax, reinterpret_cast(&one)); + movaps(ONE, xword[rax]); + + // Used to negate registers + static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; + mov(rax, reinterpret_cast(&neg)); + movaps(NEGBIT, xword[rax]); + + // Jump to start of the shader program + jmp(ABI_PARAM3); + + // Compile entire program + Compile_Block(static_cast(g_state.vs.program_code.size())); + + // Free memory that's no longer needed + return_offsets.clear(); + return_offsets.shrink_to_fit(); + + ready(); + + uintptr_t size = reinterpret_cast(getCurr()) - reinterpret_cast(program); + ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); + LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); +} + +JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h new file mode 100644 index 000000000..f37548306 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -0,0 +1,121 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "common/bit_set.h" +#include "common/common_types.h" +#include "common/x64/emitter.h" +#include "video_core/shader/shader.h" + +using nihstro::Instruction; +using nihstro::OpCode; +using nihstro::SwizzlePattern; + +namespace Pica { + +namespace Shader { + +/// Memory allocated for each compiled shader (64Kb) +constexpr size_t MAX_SHADER_SIZE = 1024 * 64; + +/** + * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 + * code that can be executed on the host machine directly. + */ +class JitShader : public Xbyak::CodeGenerator { +public: + JitShader(); + + void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { + program(&setup, &state, instruction_labels[offset].getAddress()); + } + + void Compile(); + + void Compile_ADD(Instruction instr); + void Compile_DP3(Instruction instr); + void Compile_DP4(Instruction instr); + void Compile_DPH(Instruction instr); + void Compile_EX2(Instruction instr); + void Compile_LG2(Instruction instr); + void Compile_MUL(Instruction instr); + void Compile_SGE(Instruction instr); + void Compile_SLT(Instruction instr); + void Compile_FLR(Instruction instr); + void Compile_MAX(Instruction instr); + void Compile_MIN(Instruction instr); + void Compile_RCP(Instruction instr); + void Compile_RSQ(Instruction instr); + void Compile_MOVA(Instruction instr); + void Compile_MOV(Instruction instr); + void Compile_NOP(Instruction instr); + void Compile_END(Instruction instr); + void Compile_CALL(Instruction instr); + void Compile_CALLC(Instruction instr); + void Compile_CALLU(Instruction instr); + void Compile_IF(Instruction instr); + void Compile_LOOP(Instruction instr); + void Compile_JMP(Instruction instr); + void Compile_CMP(Instruction instr); + void Compile_MAD(Instruction instr); + +private: + void Compile_Block(unsigned end); + void Compile_NextInstr(); + + void 
Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, + Xbyak::Xmm dest); + void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); + + /** + * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying + * zero by inf. Clobbers `src2` and `scratch`. + */ + void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); + + void Compile_EvaluateCondition(Instruction instr); + void Compile_UniformCondition(Instruction instr); + + /** + * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction. + */ + void Compile_Return(); + + BitSet32 PersistentCallerSavedRegs(); + + /** + * Assertion evaluated at compile-time, but only triggered if executed at runtime. + * @param msg Message to be logged if the assertion fails. + */ + void Compile_Assert(bool condition, const char* msg); + + /** + * Analyzes the entire shader program for `CALL` instructions before emitting any code, + * identifying the locations where a return needs to be inserted.
+ */ + void FindReturnOffsets(); + + /// Mapping of Pica VS instructions to pointers in the emitted code + std::array instruction_labels; + + /// Offsets in code where a return needs to be inserted + std::vector return_offsets; + + unsigned program_counter = 0; ///< Offset of the next instruction to decode + bool looping = false; ///< True if compiling a loop, used to check for nested loops + + using CompiledShader = void(const void* setup, void* state, const u8* start_addr); + CompiledShader* program = nullptr; +}; + +} // Shader + +} // Pica -- cgit v1.2.3 From 114d6b2f97eb62c7d8c958ebb391b70b026130f9 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 01:21:16 -0800 Subject: VideoCore/Shader: Split interpreter and JIT into separate ShaderEngines --- src/video_core/CMakeLists.txt | 2 + src/video_core/pica.cpp | 2 +- src/video_core/shader/shader.cpp | 92 ++++------------------------ src/video_core/shader/shader.h | 5 +- src/video_core/shader/shader_interpreter.cpp | 39 ++++++++++-- src/video_core/shader/shader_interpreter.h | 19 +++--- src/video_core/shader/shader_jit_x64.cpp | 56 +++++++++++++++++ src/video_core/shader/shader_jit_x64.h | 35 +++++++++++ 8 files changed, 153 insertions(+), 97 deletions(-) create mode 100644 src/video_core/shader/shader_jit_x64.cpp create mode 100644 src/video_core/shader/shader_jit_x64.h (limited to 'src/video_core') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 36397cce9..d55b84ce0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -50,9 +50,11 @@ set(HEADERS if(ARCHITECTURE_x86_64) set(SRCS ${SRCS} + shader/shader_jit_x64.cpp shader/shader_jit_x64_compiler.cpp) set(HEADERS ${HEADERS} + shader/shader_jit_x64.h shader/shader_jit_x64_compiler.h) endif() diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index ce2bd455e..b4a77c632 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -499,7 +499,7 @@ void Init() { } 
void Shutdown() { - Shader::ClearCache(); + Shader::Shutdown(); } template diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 97c6519d6..b30dae476 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -2,14 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include #include #include -#include -#include -#include -#include "common/bit_field.h" -#include "common/hash.h" #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/pica.h" @@ -17,7 +11,7 @@ #include "video_core/shader/shader.h" #include "video_core/shader/shader_interpreter.h" #ifdef ARCHITECTURE_x86_64 -#include "video_core/shader/shader_jit_x64_compiler.h" +#include "video_core/shader/shader_jit_x64.h" #endif // ARCHITECTURE_x86_64 #include "video_core/video_core.h" @@ -87,93 +81,33 @@ void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { conditional_code[1] = false; } -class MergedShaderEngine : public ShaderEngine { -public: - void SetupBatch(const ShaderSetup* setup) override; - void Run(UnitState& state, unsigned int entry_point) const override; - DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const override; - -private: - const ShaderSetup* setup = nullptr; -}; - -#ifdef ARCHITECTURE_x86_64 -static std::unordered_map> shader_map; -static const JitShader* jit_shader; -#endif // ARCHITECTURE_x86_64 +MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ClearCache() { #ifdef ARCHITECTURE_x86_64 - shader_map.clear(); +static std::unique_ptr jit_engine; #endif // ARCHITECTURE_x86_64 -} - -void MergedShaderEngine::SetupBatch(const ShaderSetup* setup_) { - setup = setup_; - if (setup == nullptr) - return; +static InterpreterEngine interpreter_engine; +ShaderEngine* GetEngine() { #ifdef ARCHITECTURE_x86_64 + // TODO(yuriks): Re-initialize on each change rather 
than being persistent if (VideoCore::g_shader_jit_enabled) { - u64 code_hash = Common::ComputeHash64(&setup->program_code, sizeof(setup->program_code)); - u64 swizzle_hash = Common::ComputeHash64(&setup->swizzle_data, sizeof(setup->swizzle_data)); - - u64 cache_key = code_hash ^ swizzle_hash; - auto iter = shader_map.find(cache_key); - if (iter != shader_map.end()) { - jit_shader = iter->second.get(); - } else { - auto shader = std::make_unique(); - shader->Compile(); - jit_shader = shader.get(); - shader_map[cache_key] = std::move(shader); + if (jit_engine == nullptr) { + jit_engine = std::make_unique(); } + return jit_engine.get(); } #endif // ARCHITECTURE_x86_64 -} - -MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); - -void MergedShaderEngine::Run(UnitState& state, unsigned int entry_point) const { - ASSERT(setup != nullptr); - ASSERT(entry_point < 1024); - MICROPROFILE_SCOPE(GPU_Shader); + return &interpreter_engine; +} +void Shutdown() { #ifdef ARCHITECTURE_x86_64 - if (VideoCore::g_shader_jit_enabled) { - jit_shader->Run(*setup, state, entry_point); - } else { - DebugData dummy_debug_data; - RunInterpreter(*setup, state, dummy_debug_data, entry_point); - } -#else - DebugData dummy_debug_data; - RunInterpreter(*setup, state, dummy_debug_data, entry_point); + jit_engine = nullptr; #endif // ARCHITECTURE_x86_64 } -DebugData MergedShaderEngine::ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const { - ASSERT(setup != nullptr); - ASSERT(entry_point < 1024); - - UnitState state; - DebugData debug_data; - - // Setup input register table - boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); - state.LoadInputVertex(input, num_attributes); - RunInterpreter(*setup, state, debug_data, entry_point); - return debug_data; -} - -ShaderEngine* GetEngine() { - static MergedShaderEngine merged_engine; - return &merged_engine; -} - } // namespace Shader } // namespace Pica diff --git 
a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 899fb2607..2afd1024f 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -6,7 +6,6 @@ #include #include -#include #include #include #include "common/assert.h" @@ -152,9 +151,6 @@ struct UnitState { void LoadInputVertex(const InputVertex& input, int num_attributes); }; -/// Clears the shader cache -void ClearCache(); - struct ShaderSetup { struct { // The float uniforms are accessed by the shader JIT using SSE instructions, and are @@ -210,6 +206,7 @@ public: // TODO(yuriks): Remove and make it non-global state somewhere ShaderEngine* GetEngine(); +void Shutdown(); } // namespace Shader diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 20fb9754b..8e2b8c548 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -7,10 +7,12 @@ #include #include #include +#include #include #include "common/assert.h" #include "common/common_types.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "common/vector_math.h" #include "video_core/pica_state.h" #include "video_core/pica_types.h" @@ -37,8 +39,8 @@ struct CallStackElement { }; template -void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData& debug_data, - unsigned offset) { +static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData& debug_data, + unsigned offset) { // TODO: Is there a maximal size for this? 
boost::container::static_vector call_stack; u32 program_counter = offset; @@ -647,9 +649,36 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData } } -// Explicit instantiation -template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData&, unsigned offset); -template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData&, unsigned offset); +void InterpreterEngine::SetupBatch(const ShaderSetup* setup_) { + setup = setup_; +} + +MICROPROFILE_DECLARE(GPU_Shader); + +void InterpreterEngine::Run(UnitState& state, unsigned int entry_point) const { + ASSERT(setup != nullptr); + ASSERT(entry_point < 1024); + + MICROPROFILE_SCOPE(GPU_Shader); + + DebugData dummy_debug_data; + RunInterpreter(*setup, state, dummy_debug_data, entry_point); +} + +DebugData InterpreterEngine::ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const { + ASSERT(setup != nullptr); + ASSERT(entry_point < 1024); + + UnitState state; + DebugData debug_data; + + // Setup input register table + boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); + state.LoadInputVertex(input, num_attributes); + RunInterpreter(*setup, state, debug_data, entry_point); + return debug_data; +} } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index 3237b50b3..43c1ed5ea 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -4,19 +4,22 @@ #pragma once +#include "video_core/shader/shader.h" + namespace Pica { namespace Shader { -struct ShaderSetup; -struct UnitState; - -template -struct DebugData; +class InterpreterEngine final : public ShaderEngine { +public: + void SetupBatch(const ShaderSetup* setup) override; + void Run(UnitState& state, unsigned int entry_point) const override; + DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const override; 
-template -void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData& debug_data, - unsigned offset); +private: + const ShaderSetup* setup = nullptr; +}; } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp new file mode 100644 index 000000000..fea79538a --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -0,0 +1,56 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/hash.h" +#include "common/microprofile.h" +#include "video_core/shader/shader.h" +#include "video_core/shader/shader_jit_x64.h" +#include "video_core/shader/shader_jit_x64_compiler.h" + +namespace Pica { +namespace Shader { + +JitX64Engine::JitX64Engine() = default; +JitX64Engine::~JitX64Engine() = default; + +void JitX64Engine::SetupBatch(const ShaderSetup* setup_) { + cached_shader = nullptr; + setup = setup_; + if (setup == nullptr) + return; + + u64 code_hash = Common::ComputeHash64(&setup->program_code, sizeof(setup->program_code)); + u64 swizzle_hash = Common::ComputeHash64(&setup->swizzle_data, sizeof(setup->swizzle_data)); + + u64 cache_key = code_hash ^ swizzle_hash; + auto iter = cache.find(cache_key); + if (iter != cache.end()) { + cached_shader = iter->second.get(); + } else { + auto shader = std::make_unique(); + shader->Compile(); + cached_shader = shader.get(); + cache.emplace_hint(iter, cache_key, std::move(shader)); + } +} + +MICROPROFILE_DECLARE(GPU_Shader); + +void JitX64Engine::Run(UnitState& state, unsigned int entry_point) const { + ASSERT(setup != nullptr); + ASSERT(cached_shader != nullptr); + ASSERT(entry_point < 1024); + + MICROPROFILE_SCOPE(GPU_Shader); + + cached_shader->Run(*setup, state, entry_point); +} + +DebugData JitX64Engine::ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const { + UNIMPLEMENTED_MSG("Shader tracing/debugging is not 
supported by the JIT."); +} + +} // namespace Shader +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h new file mode 100644 index 000000000..df18de2c2 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.h @@ -0,0 +1,35 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include "common/common_types.h" +#include "video_core/shader/shader.h" + +namespace Pica { +namespace Shader { + +class JitShader; + +class JitX64Engine final : public ShaderEngine { +public: + JitX64Engine(); + ~JitX64Engine() override; + + void SetupBatch(const ShaderSetup* setup) override; + void Run(UnitState& state, unsigned int entry_point) const override; + DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, + unsigned int entry_point) const override; + +private: + const ShaderSetup* setup = nullptr; + + std::unordered_map> cache; + const JitShader* cached_shader = nullptr; +}; + +} // namespace Shader +} // namespace Pica -- cgit v1.2.3 From ade7ed7c5fd383e77c4d6949e652e1fd83844233 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 01:30:55 -0800 Subject: VideoCore/Shader: Move ProduceDebugInfo to InterpreterEngine --- src/video_core/shader/shader.h | 11 ----------- src/video_core/shader/shader_interpreter.h | 11 ++++++++++- src/video_core/shader/shader_jit_x64.cpp | 5 ----- src/video_core/shader/shader_jit_x64.h | 2 -- 4 files changed, 10 insertions(+), 19 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 2afd1024f..9d2410487 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -14,7 +14,6 @@ #include "common/vector_math.h" #include "video_core/pica.h" #include "video_core/pica_types.h" -#include "video_core/shader/debug_data.h" using 
nihstro::RegisterType; using nihstro::SourceRegister; @@ -192,16 +191,6 @@ public: * @param state Shader unit state, must be setup per shader and per shader unit */ virtual void Run(UnitState& state, unsigned int entry_point) const = 0; - - /** - * Produce debug information based on the given shader and input vertex - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - * @param config Configuration object for the shader pipeline - * @return Debug information for this shader with regards to the given vertex - */ - virtual DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const = 0; }; // TODO(yuriks): Remove and make it non-global state somewhere diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index 43c1ed5ea..c3691da70 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -4,6 +4,7 @@ #pragma once +#include "video_core/shader/debug_data.h" #include "video_core/shader/shader.h" namespace Pica { @@ -14,8 +15,16 @@ class InterpreterEngine final : public ShaderEngine { public: void SetupBatch(const ShaderSetup* setup) override; void Run(UnitState& state, unsigned int entry_point) const override; + + /** + * Produce debug information based on the given shader and input vertex + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline + * @return Debug information for this shader with regards to the given vertex + */ DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const override; + unsigned int entry_point) const; private: const ShaderSetup* setup = nullptr; diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index fea79538a..6d83948e1 100644 --- 
a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -47,10 +47,5 @@ void JitX64Engine::Run(UnitState& state, unsigned int entry_point) const { cached_shader->Run(*setup, state, entry_point); } -DebugData JitX64Engine::ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const { - UNIMPLEMENTED_MSG("Shader tracing/debugging is not supported by the JIT."); -} - } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index df18de2c2..b26044477 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -21,8 +21,6 @@ public: void SetupBatch(const ShaderSetup* setup) override; void Run(UnitState& state, unsigned int entry_point) const override; - DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const override; private: const ShaderSetup* setup = nullptr; -- cgit v1.2.3 From fa4ac279a77871f45733d43fdecf756ff1e7ece0 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 02:29:22 -0800 Subject: shader_jit_x64: Don't read program from global state --- src/video_core/shader/shader_jit_x64.cpp | 2 +- src/video_core/shader/shader_jit_x64_compiler.cpp | 36 ++++++++++------------- src/video_core/shader/shader_jit_x64_compiler.h | 6 +++- 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 6d83948e1..755ae119f 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -29,7 +29,7 @@ void JitX64Engine::SetupBatch(const ShaderSetup* setup_) { cached_shader = iter->second.get(); } else { auto shader = std::make_unique(); - shader->Compile(); + shader->Compile(&setup->program_code, &setup->swizzle_data); cached_shader = shader.get(); 
cache.emplace_hint(iter, cache_key, std::move(shader)); } diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 880543306..49806e8c9 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -151,15 +151,6 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b; /// Raw constant for the destination register enable mask that indicates all components are enabled static const u8 NO_DEST_REG_MASK = 0xf; -/** - * Get the vertex shader instruction for a given offset in the current shader program - * @param offset Offset in the current shader program of the instruction - * @return Instruction at the specified offset - */ -static Instruction GetVertexShaderInstruction(size_t offset) { - return {g_state.vs.program_code[offset]}; -} - static void LogCritical(const char* msg) { LOG_CRITICAL(HW_GPU, "%s", msg); } @@ -233,7 +224,7 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe movaps(dest, xword[src_ptr + src_offset_disp]); } - SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; + SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; // Generate instructions for source register swizzling as needed u8 sel = swiz.GetRawSelector(src_num); @@ -264,7 +255,7 @@ void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { dest = instr.common.dest.Value(); } - SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; + SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; size_t dest_offset_disp = UnitState::OutputOffset(dest); @@ -522,7 +513,7 @@ void JitShader::Compile_MIN(Instruction instr) { } void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]}; + SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]}; if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { return; // NoOp @@ -796,7 +787,7 
@@ void JitShader::Compile_NextInstr() { L(instruction_labels[program_counter]); - Instruction instr = GetVertexShaderInstruction(program_counter++); + Instruction instr = {(*program_code)[program_counter++]}; OpCode::Id opcode = instr.opcode.Value(); auto instr_func = instr_table[static_cast(opcode)]; @@ -814,8 +805,8 @@ void JitShader::Compile_NextInstr() { void JitShader::FindReturnOffsets() { return_offsets.clear(); - for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { - Instruction instr = GetVertexShaderInstruction(offset); + for (size_t offset = 0; offset < program_code->size(); ++offset) { + Instruction instr = {(*program_code)[offset]}; switch (instr.opcode.Value()) { case OpCode::Id::CALL: @@ -833,7 +824,11 @@ void JitShader::FindReturnOffsets() { std::sort(return_offsets.begin(), return_offsets.end()); } -void JitShader::Compile() { +void JitShader::Compile(const std::array* program_code_, + const std::array* swizzle_data_) { + program_code = program_code_; + swizzle_data = swizzle_data_; + // Reset flow control state program = (CompiledShader*)getCurr(); program_counter = 0; @@ -868,17 +863,18 @@ void JitShader::Compile() { jmp(ABI_PARAM3); // Compile entire program - Compile_Block(static_cast(g_state.vs.program_code.size())); + Compile_Block(static_cast(program_code->size())); // Free memory that's no longer needed + program_code = nullptr; + swizzle_data = nullptr; return_offsets.clear(); return_offsets.shrink_to_fit(); ready(); - uintptr_t size = reinterpret_cast(getCurr()) - reinterpret_cast(program); - ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); - LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); + ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); + LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); } JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} diff --git 
a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h index f37548306..29e9875ea 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -38,7 +38,8 @@ public: program(&setup, &state, instruction_labels[offset].getAddress()); } - void Compile(); + void Compile(const std::array* program_code, + const std::array* swizzle_data); void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); @@ -103,6 +104,9 @@ private: */ void FindReturnOffsets(); + const std::array* program_code = nullptr; + const std::array* swizzle_data = nullptr; + /// Mapping of Pica VS instructions to pointers in the emitted code std::array instruction_labels; -- cgit v1.2.3 From 1a2acc3baae01b9469ee97333c2ec4d58c8a0b91 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 02:33:43 -0800 Subject: Shader: Don't read ShaderSetup from global state --- src/video_core/shader/shader_interpreter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 8e2b8c548..d1f11142d 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -75,9 +75,9 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData } }; - const auto& uniforms = g_state.vs.uniforms; - const auto& swizzle_data = g_state.vs.swizzle_data; - const auto& program_code = g_state.vs.program_code; + const auto& uniforms = setup.uniforms; + const auto& swizzle_data = setup.swizzle_data; + const auto& program_code = setup.program_code; // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; -- cgit v1.2.3 From 9ea5eacf919c8c257f8c5fda65e5fac2b6adee07 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 14:09:02 -0800 Subject: Shader: Initialize 
conditional_code in interpreter This doesn't belong in LoadInputVertex because it also happens for non-VS invocations. Since it's not used by the JIT it seems adequate to initialize it in the interpreter which is the only thing that cares about them. --- src/video_core/shader/shader.cpp | 3 --- src/video_core/shader/shader_interpreter.cpp | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index b30dae476..1662b5d38 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -76,9 +76,6 @@ void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { for (int i = 0; i < num_attributes; i++) registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; - - conditional_code[0] = false; - conditional_code[1] = false; } MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index d1f11142d..ecc227089 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -45,6 +45,9 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData boost::container::static_vector call_stack; u32 program_counter = offset; + state.conditional_code[0] = false; + state.conditional_code[1] = false; + auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { // -1 to make sure when incrementing the PC we end up at the correct offset -- cgit v1.2.3 From 6fa3687afc97685101f9ee5c65cf98f505980695 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 14:38:03 -0800 Subject: Shader: Remove OutputRegisters struct --- src/video_core/command_processor.cpp | 7 ++++--- src/video_core/shader/shader.cpp | 11 ++++++----- 
src/video_core/shader/shader.h | 17 +++++------------ src/video_core/shader/shader_interpreter.cpp | 4 ++-- 4 files changed, 17 insertions(+), 22 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 694c9f169..66d19cba0 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -152,8 +152,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { Shader::UnitState shader_unit; shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); shader_engine->Run(shader_unit, regs.vs.main_offset); - Shader::OutputVertex output_vertex = - shader_unit.output_registers.ToVertex(regs.vs); + auto output_vertex = Shader::OutputVertex::FromRegisters( + shader_unit.registers.output, regs, regs.vs.output_mask); // Send to renderer using Pica::Shader::OutputVertex; @@ -291,7 +291,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { shader_engine->Run(shader_unit, regs.vs.main_offset); // Retrieve vertex from register data - output_vertex = shader_unit.output_registers.ToVertex(regs.vs); + output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output, + regs, regs.vs.output_mask); if (is_indexed) { vertex_cache[vertex_cache_pos] = output_vertex; diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 1662b5d38..2da50bd62 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -19,7 +19,8 @@ namespace Pica { namespace Shader { -OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { +OutputVertex OutputVertex::FromRegisters(Math::Vec4 output_regs[16], const Regs& regs, + u32 output_mask) { // Setup output data OutputVertex ret; // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. 
We need to @@ -27,13 +28,13 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { unsigned index = 0; for (unsigned i = 0; i < 7; ++i) { - if (index >= g_state.regs.vs_output_total) + if (index >= regs.vs_output_total) break; - if ((config.output_mask & (1 << i)) == 0) + if ((output_mask & (1 << i)) == 0) continue; - const auto& output_register_map = g_state.regs.vs_output_attributes[index]; + const auto& output_register_map = regs.vs_output_attributes[index]; u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, output_register_map.map_w}; @@ -41,7 +42,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { for (unsigned comp = 0; comp < 4; ++comp) { float24* out = ((float24*)&ret) + semantics[comp]; if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = value[i][comp]; + *out = output_regs[i][comp]; } else { // Zero output so that attributes which aren't output won't have denormals in them, // which would slow us down later. diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 9d2410487..7d51d0044 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -73,19 +73,13 @@ struct OutputVertex { ret.Lerp(factor, v1); return ret; } + + static OutputVertex FromRegisters(Math::Vec4 output_regs[16], const Regs& regs, + u32 output_mask); }; static_assert(std::is_pod::value, "Structure is not POD"); static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); -struct OutputRegisters { - OutputRegisters() = default; - - alignas(16) Math::Vec4 value[16]; - - OutputVertex ToVertex(const Regs::ShaderConfig& config) const; -}; -static_assert(std::is_pod::value, "Structure is not POD"); - /** * This structure contains the state information that needs to be unique for a shader unit. The 3DS * has four shader units that process shaders in parallel. 
At the present, Citra only implements a @@ -98,11 +92,10 @@ struct UnitState { // required to be 16-byte aligned. alignas(16) Math::Vec4 input[16]; alignas(16) Math::Vec4 temporary[16]; + alignas(16) Math::Vec4 output[16]; } registers; static_assert(std::is_pod::value, "Structure is not POD"); - OutputRegisters output_registers; - bool conditional_code[2]; // Two Address registers and one loop counter @@ -128,7 +121,7 @@ struct UnitState { static size_t OutputOffset(const DestRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Output: - return offsetof(UnitState, output_registers.value) + + return offsetof(UnitState, registers.output) + reg.GetIndex() * sizeof(Math::Vec4); case RegisterType::Temporary: diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index ecc227089..a6197c10a 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -175,7 +175,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData float24* dest = (instr.common.dest.Value() < 0x10) - ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] + ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -518,7 +518,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData float24* dest = (instr.mad.dest.Value() < 0x10) - ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] + ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] : (instr.mad.dest.Value() < 0x20) ? 
&state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; -- cgit v1.2.3 From 0f642741451e3f75c2f1d64ae9beccaf1437f12c Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 16:06:04 -0800 Subject: VideoCore/Shader: Move per-batch ShaderEngine state into ShaderSetup --- src/video_core/command_processor.cpp | 8 ++++---- src/video_core/shader/shader.h | 17 +++++++++++++---- src/video_core/shader/shader_interpreter.cpp | 16 +++++++--------- src/video_core/shader/shader_interpreter.h | 11 ++++------- src/video_core/shader/shader_jit_x64.cpp | 25 ++++++++++--------------- src/video_core/shader/shader_jit_x64.h | 7 ++----- 6 files changed, 40 insertions(+), 44 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 66d19cba0..c3872d06c 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -143,7 +143,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { immediate_attribute_id = 0; auto* shader_engine = Shader::GetEngine(); - shader_engine->SetupBatch(&g_state.vs); + shader_engine->SetupBatch(g_state.vs); // Send to vertex shader if (g_debug_context) @@ -151,7 +151,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { static_cast(&immediate_input)); Shader::UnitState shader_unit; shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); - shader_engine->Run(shader_unit, regs.vs.main_offset); + shader_engine->Run(g_state.vs, shader_unit, regs.vs.main_offset); auto output_vertex = Shader::OutputVertex::FromRegisters( shader_unit.registers.output, regs, regs.vs.output_mask); @@ -248,7 +248,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { auto* shader_engine = Shader::GetEngine(); Shader::UnitState shader_unit; - shader_engine->SetupBatch(&g_state.vs); + shader_engine->SetupBatch(g_state.vs); for (unsigned int index = 0; index < regs.num_vertices; ++index) { 
// Indexed rendering doesn't use the start offset @@ -288,7 +288,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); - shader_engine->Run(shader_unit, regs.vs.main_offset); + shader_engine->Run(g_state.vs, shader_unit, regs.vs.main_offset); // Retrieve vertex from register data output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output, diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 7d51d0044..f26d2ba4f 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -167,6 +167,12 @@ struct ShaderSetup { std::array program_code; std::array swizzle_data; + + /// Data private to ShaderEngines + struct EngineData { + /// Used by the JIT, points to a compiled shader object. + const void* cached_shader = nullptr; + } engine_data; }; class ShaderEngine { @@ -177,13 +183,16 @@ public: * Performs any shader unit setup that only needs to happen once per shader (as opposed to once * per vertex, which would happen within the `Run` function). */ - virtual void SetupBatch(const ShaderSetup* setup) = 0; + virtual void SetupBatch(ShaderSetup& setup) = 0; /** - * Runs the currently setup shader - * @param state Shader unit state, must be setup per shader and per shader unit + * Runs the currently setup shader. + * + * @param setup Shader engine state, must be setup with SetupBatch on each shader change. + * @param state Shader unit state, must be setup with input data before each shader invocation. 
*/ - virtual void Run(UnitState& state, unsigned int entry_point) const = 0; + virtual void Run(const ShaderSetup& setup, UnitState& state, + unsigned int entry_point) const = 0; }; // TODO(yuriks): Remove and make it non-global state somewhere diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index a6197c10a..e44abbf1d 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -652,25 +652,23 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData } } -void InterpreterEngine::SetupBatch(const ShaderSetup* setup_) { - setup = setup_; -} +void InterpreterEngine::SetupBatch(ShaderSetup& setup) {} MICROPROFILE_DECLARE(GPU_Shader); -void InterpreterEngine::Run(UnitState& state, unsigned int entry_point) const { - ASSERT(setup != nullptr); +void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state, + unsigned int entry_point) const { ASSERT(entry_point < 1024); MICROPROFILE_SCOPE(GPU_Shader); DebugData dummy_debug_data; - RunInterpreter(*setup, state, dummy_debug_data, entry_point); + RunInterpreter(setup, state, dummy_debug_data, entry_point); } -DebugData InterpreterEngine::ProduceDebugInfo(const InputVertex& input, int num_attributes, +DebugData InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, + const InputVertex& input, int num_attributes, unsigned int entry_point) const { - ASSERT(setup != nullptr); ASSERT(entry_point < 1024); UnitState state; @@ -679,7 +677,7 @@ DebugData InterpreterEngine::ProduceDebugInfo(const InputVertex& input, in // Setup input register table boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); state.LoadInputVertex(input, num_attributes); - RunInterpreter(*setup, state, debug_data, entry_point); + RunInterpreter(setup, state, debug_data, entry_point); return debug_data; } diff --git a/src/video_core/shader/shader_interpreter.h 
b/src/video_core/shader/shader_interpreter.h index c3691da70..7f94d405f 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -13,8 +13,8 @@ namespace Shader { class InterpreterEngine final : public ShaderEngine { public: - void SetupBatch(const ShaderSetup* setup) override; - void Run(UnitState& state, unsigned int entry_point) const override; + void SetupBatch(ShaderSetup& setup) override; + void Run(const ShaderSetup& setup, UnitState& state, unsigned int entry_point) const override; /** * Produce debug information based on the given shader and input vertex @@ -23,11 +23,8 @@ public: * @param config Configuration object for the shader pipeline * @return Debug information for this shader with regards to the given vertex */ - DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, - unsigned int entry_point) const; - -private: - const ShaderSetup* setup = nullptr; + DebugData ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input, + int num_attributes, unsigned int entry_point) const; }; } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 755ae119f..15c1d60b5 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -14,37 +14,32 @@ namespace Shader { JitX64Engine::JitX64Engine() = default; JitX64Engine::~JitX64Engine() = default; -void JitX64Engine::SetupBatch(const ShaderSetup* setup_) { - cached_shader = nullptr; - setup = setup_; - if (setup == nullptr) - return; - - u64 code_hash = Common::ComputeHash64(&setup->program_code, sizeof(setup->program_code)); - u64 swizzle_hash = Common::ComputeHash64(&setup->swizzle_data, sizeof(setup->swizzle_data)); +void JitX64Engine::SetupBatch(ShaderSetup& setup) { + u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); + u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, 
sizeof(setup.swizzle_data)); u64 cache_key = code_hash ^ swizzle_hash; auto iter = cache.find(cache_key); if (iter != cache.end()) { - cached_shader = iter->second.get(); + setup.engine_data.cached_shader = iter->second.get(); } else { auto shader = std::make_unique(); - shader->Compile(&setup->program_code, &setup->swizzle_data); - cached_shader = shader.get(); + shader->Compile(&setup.program_code, &setup.swizzle_data); + setup.engine_data.cached_shader = shader.get(); cache.emplace_hint(iter, cache_key, std::move(shader)); } } MICROPROFILE_DECLARE(GPU_Shader); -void JitX64Engine::Run(UnitState& state, unsigned int entry_point) const { - ASSERT(setup != nullptr); - ASSERT(cached_shader != nullptr); +void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state, unsigned int entry_point) const { + ASSERT(setup.engine_data.cached_shader != nullptr); ASSERT(entry_point < 1024); MICROPROFILE_SCOPE(GPU_Shader); - cached_shader->Run(*setup, state, entry_point); + const JitShader* shader = static_cast(setup.engine_data.cached_shader); + shader->Run(setup, state, entry_point); } } // namespace Shader diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index b26044477..bd30f51e2 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -19,14 +19,11 @@ public: JitX64Engine(); ~JitX64Engine() override; - void SetupBatch(const ShaderSetup* setup) override; - void Run(UnitState& state, unsigned int entry_point) const override; + void SetupBatch(ShaderSetup& setup) override; + void Run(const ShaderSetup& setup, UnitState& state, unsigned int entry_point) const override; private: - const ShaderSetup* setup = nullptr; - std::unordered_map> cache; - const JitShader* cached_shader = nullptr; }; } // namespace Shader -- cgit v1.2.3 From 0e9081b97348c65029c96697443acb0dbbc58756 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sat, 17 Dec 2016 16:16:02 -0800 Subject: VideoCore/Shader: 
Move entry_point to SetupBatch --- src/video_core/command_processor.cpp | 8 ++++---- src/video_core/shader/shader.h | 6 +++--- src/video_core/shader/shader_interpreter.cpp | 19 +++++++++---------- src/video_core/shader/shader_interpreter.h | 6 +++--- src/video_core/shader/shader_jit_x64.cpp | 10 ++++++---- src/video_core/shader/shader_jit_x64.h | 4 ++-- 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index c3872d06c..eb79974a8 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -143,7 +143,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { immediate_attribute_id = 0; auto* shader_engine = Shader::GetEngine(); - shader_engine->SetupBatch(g_state.vs); + shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); // Send to vertex shader if (g_debug_context) @@ -151,7 +151,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { static_cast(&immediate_input)); Shader::UnitState shader_unit; shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); - shader_engine->Run(g_state.vs, shader_unit, regs.vs.main_offset); + shader_engine->Run(g_state.vs, shader_unit); auto output_vertex = Shader::OutputVertex::FromRegisters( shader_unit.registers.output, regs, regs.vs.output_mask); @@ -248,7 +248,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { auto* shader_engine = Shader::GetEngine(); Shader::UnitState shader_unit; - shader_engine->SetupBatch(g_state.vs); + shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); for (unsigned int index = 0; index < regs.num_vertices; ++index) { // Indexed rendering doesn't use the start offset @@ -288,7 +288,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); - 
shader_engine->Run(g_state.vs, shader_unit, regs.vs.main_offset); + shader_engine->Run(g_state.vs, shader_unit); // Retrieve vertex from register data output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output, diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index f26d2ba4f..44d9f76c3 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -170,6 +170,7 @@ struct ShaderSetup { /// Data private to ShaderEngines struct EngineData { + unsigned int entry_point; /// Used by the JIT, points to a compiled shader object. const void* cached_shader = nullptr; } engine_data; @@ -183,7 +184,7 @@ public: * Performs any shader unit setup that only needs to happen once per shader (as opposed to once * per vertex, which would happen within the `Run` function). */ - virtual void SetupBatch(ShaderSetup& setup) = 0; + virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0; /** * Runs the currently setup shader. @@ -191,8 +192,7 @@ public: * @param setup Shader engine state, must be setup with SetupBatch on each shader change. * @param state Shader unit state, must be setup with input data before each shader invocation. 
*/ - virtual void Run(const ShaderSetup& setup, UnitState& state, - unsigned int entry_point) const = 0; + virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0; }; // TODO(yuriks): Remove and make it non-global state somewhere diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index e44abbf1d..c0c89b857 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -652,32 +652,31 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData } } -void InterpreterEngine::SetupBatch(ShaderSetup& setup) {} +void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { + ASSERT(entry_point < 1024); + setup.engine_data.entry_point = entry_point; +} MICROPROFILE_DECLARE(GPU_Shader); -void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state, - unsigned int entry_point) const { - ASSERT(entry_point < 1024); +void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const { MICROPROFILE_SCOPE(GPU_Shader); DebugData dummy_debug_data; - RunInterpreter(setup, state, dummy_debug_data, entry_point); + RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point); } DebugData InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, - const InputVertex& input, int num_attributes, - unsigned int entry_point) const { - ASSERT(entry_point < 1024); - + const InputVertex& input, + int num_attributes) const { UnitState state; DebugData debug_data; // Setup input register table boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); state.LoadInputVertex(input, num_attributes); - RunInterpreter(setup, state, debug_data, entry_point); + RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); return debug_data; } diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index 7f94d405f..d6c0e2d8c 
100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -13,8 +13,8 @@ namespace Shader { class InterpreterEngine final : public ShaderEngine { public: - void SetupBatch(ShaderSetup& setup) override; - void Run(const ShaderSetup& setup, UnitState& state, unsigned int entry_point) const override; + void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; + void Run(const ShaderSetup& setup, UnitState& state) const override; /** * Produce debug information based on the given shader and input vertex @@ -24,7 +24,7 @@ public: * @return Debug information for this shader with regards to the given vertex */ DebugData ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input, - int num_attributes, unsigned int entry_point) const; + int num_attributes) const; }; } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 15c1d60b5..0ee0dd9ef 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -14,7 +14,10 @@ namespace Shader { JitX64Engine::JitX64Engine() = default; JitX64Engine::~JitX64Engine() = default; -void JitX64Engine::SetupBatch(ShaderSetup& setup) { +void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { + ASSERT(entry_point < 1024); + setup.engine_data.entry_point = entry_point; + u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data)); @@ -32,14 +35,13 @@ void JitX64Engine::SetupBatch(ShaderSetup& setup) { MICROPROFILE_DECLARE(GPU_Shader); -void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state, unsigned int entry_point) const { +void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const { ASSERT(setup.engine_data.cached_shader != nullptr); - ASSERT(entry_point < 1024); MICROPROFILE_SCOPE(GPU_Shader); const 
JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader); - shader->Run(setup, state, entry_point); + shader->Run(setup, state, setup.engine_data.entry_point); } } // namespace Shader diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index bd30f51e2..078b2cba5 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -19,8 +19,8 @@ public: JitX64Engine(); ~JitX64Engine() override; - void SetupBatch(ShaderSetup& setup) override; - void Run(const ShaderSetup& setup, UnitState& state, unsigned int entry_point) const override; + void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; + void Run(const ShaderSetup& setup, UnitState& state) const override; private: std::unordered_map<u64, std::unique_ptr<JitShader>> cache; -- cgit v1.2.3