diff options
author | James Rowe <jroweboy@gmail.com> | 2018-01-12 04:07:44 +0100 |
---|---|---|
committer | James Rowe <jroweboy@gmail.com> | 2018-01-13 03:11:03 +0100 |
commit | 1d28b2e142f845773e2b90e267d9632e196a99b9 (patch) | |
tree | 027a3586a0fc927731afb3711c328c6dafc8551f /src/video_core/shader | |
parent | Massive removal of unused modules (diff) | |
download | yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.gz yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.bz2 yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.lz yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.xz yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.zst yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.zip |
Diffstat (limited to 'src/video_core/shader')
-rw-r--r-- | src/video_core/shader/debug_data.h | 186 | ||||
-rw-r--r-- | src/video_core/shader/shader.cpp | 154 | ||||
-rw-r--r-- | src/video_core/shader/shader.h | 233 | ||||
-rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 701 | ||||
-rw-r--r-- | src/video_core/shader/shader_interpreter.h | 32 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 48 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 30 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.cpp | 942 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.h | 127 |
9 files changed, 0 insertions, 2453 deletions
diff --git a/src/video_core/shader/debug_data.h b/src/video_core/shader/debug_data.h deleted file mode 100644 index 9e82122e1..000000000 --- a/src/video_core/shader/debug_data.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2016 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <vector> -#include "common/common_types.h" -#include "common/vector_math.h" -#include "video_core/pica_types.h" - -namespace Pica { -namespace Shader { - -/// Helper structure used to keep track of data useful for inspection of shader emulation -template <bool full_debugging> -struct DebugData; - -template <> -struct DebugData<false> { - // TODO: Hide these behind and interface and move them to DebugData<true> - u32 max_offset = 0; ///< maximum program counter ever reached - u32 max_opdesc_id = 0; ///< maximum swizzle pattern index ever used -}; - -template <> -struct DebugData<true> { - /// Records store the input and output operands of a particular instruction. 
- struct Record { - enum Type { - // Floating point arithmetic operands - SRC1 = 0x1, - SRC2 = 0x2, - SRC3 = 0x4, - - // Initial and final output operand value - DEST_IN = 0x8, - DEST_OUT = 0x10, - - // Current and next instruction offset (in words) - CUR_INSTR = 0x20, - NEXT_INSTR = 0x40, - - // Output address register value - ADDR_REG_OUT = 0x80, - - // Result of a comparison instruction - CMP_RESULT = 0x100, - - // Input values for conditional flow control instructions - COND_BOOL_IN = 0x200, - COND_CMP_IN = 0x400, - - // Input values for a loop - LOOP_INT_IN = 0x800, - }; - - Math::Vec4<float24> src1; - Math::Vec4<float24> src2; - Math::Vec4<float24> src3; - - Math::Vec4<float24> dest_in; - Math::Vec4<float24> dest_out; - - s32 address_registers[2]; - bool conditional_code[2]; - bool cond_bool; - bool cond_cmp[2]; - Math::Vec4<u8> loop_int; - - u32 instruction_offset; - u32 next_instruction; - - /// set of enabled fields (as a combination of Type flags) - unsigned mask = 0; - }; - - u32 max_offset = 0; ///< maximum program counter ever reached - u32 max_opdesc_id = 0; ///< maximum swizzle pattern index ever used - - /// List of records for each executed shader instruction - std::vector<DebugData<true>::Record> records; -}; - -/// Type alias for better readability -using DebugDataRecord = DebugData<true>::Record; - -/// Helper function to set a DebugData<true>::Record field based on the template enum parameter. 
-template <DebugDataRecord::Type type, typename ValueType> -inline void SetField(DebugDataRecord& record, ValueType value); - -template <> -inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) { - record.src1.x = value[0]; - record.src1.y = value[1]; - record.src1.z = value[2]; - record.src1.w = value[3]; -} - -template <> -inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) { - record.src2.x = value[0]; - record.src2.y = value[1]; - record.src2.z = value[2]; - record.src2.w = value[3]; -} - -template <> -inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) { - record.src3.x = value[0]; - record.src3.y = value[1]; - record.src3.z = value[2]; - record.src3.w = value[3]; -} - -template <> -inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) { - record.dest_in.x = value[0]; - record.dest_in.y = value[1]; - record.dest_in.z = value[2]; - record.dest_in.w = value[3]; -} - -template <> -inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) { - record.dest_out.x = value[0]; - record.dest_out.y = value[1]; - record.dest_out.z = value[2]; - record.dest_out.w = value[3]; -} - -template <> -inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) { - record.address_registers[0] = value[0]; - record.address_registers[1] = value[1]; -} - -template <> -inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) { - record.conditional_code[0] = value[0]; - record.conditional_code[1] = value[1]; -} - -template <> -inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) { - record.cond_bool = value; -} - -template <> -inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) { - record.cond_cmp[0] = value[0]; - record.cond_cmp[1] = value[1]; -} - -template <> -inline void 
SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) { - record.loop_int = value; -} - -template <> -inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) { - record.instruction_offset = value; -} - -template <> -inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) { - record.next_instruction = value; -} - -/// Helper function to set debug information on the current shader iteration. -template <DebugDataRecord::Type type, typename ValueType> -inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) { - // Debugging disabled => nothing to do -} - -template <DebugDataRecord::Type type, typename ValueType> -inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) { - if (offset >= debug_data.records.size()) - debug_data.records.resize(offset + 1); - - SetField<type, ValueType>(debug_data.records[offset], value); - debug_data.records[offset].mask |= type; -} - -} // namespace Shader -} // namespace Pica diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp deleted file mode 100644 index 2857d2829..000000000 --- a/src/video_core/shader/shader.cpp +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <cmath> -#include <cstring> -#include "common/bit_set.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "video_core/pica_state.h" -#include "video_core/regs_rasterizer.h" -#include "video_core/regs_shader.h" -#include "video_core/shader/shader.h" -#include "video_core/shader/shader_interpreter.h" -#ifdef ARCHITECTURE_x86_64 -#include "video_core/shader/shader_jit_x64.h" -#endif // ARCHITECTURE_x86_64 -#include "video_core/video_core.h" - -namespace Pica { - -namespace Shader { - -OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, - const AttributeBuffer& input) { - // Setup output data - union { - OutputVertex ret{}; - std::array<float24, 24> vertex_slots; - }; - static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes."); - - unsigned int num_attributes = regs.vs_output_total; - ASSERT(num_attributes <= 7); - for (unsigned int i = 0; i < num_attributes; ++i) { - const auto& output_register_map = regs.vs_output_attributes[i]; - - RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = { - output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, - output_register_map.map_w}; - - for (unsigned comp = 0; comp < 4; ++comp) { - RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp]; - if (semantic < vertex_slots.size()) { - vertex_slots[semantic] = input.attr[i][comp]; - } else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) { - LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic); - } - } - } - - // The hardware takes the absolute and saturates vertex colors like this, *before* doing - // interpolation - for (unsigned i = 0; i < 4; ++i) { - float c = std::fabs(ret.color[i].ToFloat32()); - ret.color[i] = float24::FromFloat32(c < 1.0f ? 
c : 1.0f); - } - - LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), " - "col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)", - ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), - ret.pos.w.ToFloat32(), ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), - ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), ret.color.x.ToFloat32(), - ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), - ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32(), ret.view.x.ToFloat32(), - ret.view.y.ToFloat32(), ret.view.z.ToFloat32()); - - return ret; -} - -void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input) { - const unsigned max_attribute = config.max_input_attribute_index; - - for (unsigned attr = 0; attr <= max_attribute; ++attr) { - unsigned reg = config.GetRegisterForAttribute(attr); - registers.input[reg] = input.attr[attr]; - } -} - -void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) { - unsigned int output_i = 0; - for (unsigned int reg : Common::BitSet<u32>(config.output_mask)) { - output.attr[output_i++] = registers.output[reg]; - } -} - -UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {} - -GSEmitter::GSEmitter() { - handlers = new Handlers; -} - -GSEmitter::~GSEmitter() { - delete handlers; -} - -void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) { - ASSERT(vertex_id < 3); - std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin()); - if (prim_emit) { - if (winding) - handlers->winding_setter(); - for (size_t i = 0; i < buffer.size(); ++i) { - AttributeBuffer output; - unsigned int output_i = 0; - for (unsigned int reg : Common::BitSet<u32>(output_mask)) { - output.attr[output_i++] = buffer[i][reg]; - } - handlers->vertex_handler(output); - } - } -} - -GSUnitState::GSUnitState() : UnitState(&emitter) {} - -void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter 
winding_setter) { - emitter.handlers->vertex_handler = std::move(vertex_handler); - emitter.handlers->winding_setter = std::move(winding_setter); -} - -void GSUnitState::ConfigOutput(const ShaderRegs& config) { - emitter.output_mask = config.output_mask; -} - -MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); - -#ifdef ARCHITECTURE_x86_64 -static std::unique_ptr<JitX64Engine> jit_engine; -#endif // ARCHITECTURE_x86_64 -static InterpreterEngine interpreter_engine; - -ShaderEngine* GetEngine() { -#ifdef ARCHITECTURE_x86_64 - // TODO(yuriks): Re-initialize on each change rather than being persistent - if (VideoCore::g_shader_jit_enabled) { - if (jit_engine == nullptr) { - jit_engine = std::make_unique<JitX64Engine>(); - } - return jit_engine.get(); - } -#endif // ARCHITECTURE_x86_64 - - return &interpreter_engine; -} - -void Shutdown() { -#ifdef ARCHITECTURE_x86_64 - jit_engine = nullptr; -#endif // ARCHITECTURE_x86_64 -} - -} // namespace Shader - -} // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h deleted file mode 100644 index a3789da01..000000000 --- a/src/video_core/shader/shader.h +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <array> -#include <cstddef> -#include <functional> -#include <type_traits> -#include <nihstro/shader_bytecode.h> -#include "common/assert.h" -#include "common/common_funcs.h" -#include "common/common_types.h" -#include "common/vector_math.h" -#include "video_core/pica_types.h" -#include "video_core/regs_rasterizer.h" -#include "video_core/regs_shader.h" - -using nihstro::RegisterType; -using nihstro::SourceRegister; -using nihstro::DestRegister; - -namespace Pica { - -namespace Shader { - -constexpr unsigned MAX_PROGRAM_CODE_LENGTH = 4096; -constexpr unsigned MAX_SWIZZLE_DATA_LENGTH = 4096; - -struct AttributeBuffer { - alignas(16) Math::Vec4<float24> attr[16]; -}; - -/// Handler type for receiving vertex outputs from vertex shader or geometry shader -using VertexHandler = std::function<void(const AttributeBuffer&)>; - -/// Handler type for signaling to invert the vertex order of the next triangle -using WindingSetter = std::function<void()>; - -struct OutputVertex { - Math::Vec4<float24> pos; - Math::Vec4<float24> quat; - Math::Vec4<float24> color; - Math::Vec2<float24> tc0; - Math::Vec2<float24> tc1; - float24 tc0_w; - INSERT_PADDING_WORDS(1); - Math::Vec3<float24> view; - INSERT_PADDING_WORDS(1); - Math::Vec2<float24> tc2; - - static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, - const AttributeBuffer& output); -}; -#define ASSERT_POS(var, pos) \ - static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \ - "offset.") -ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X); -ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X); -ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R); -ASSERT_POS(tc0, RasterizerRegs::VSOutputAttributes::TEXCOORD0_U); -ASSERT_POS(tc1, RasterizerRegs::VSOutputAttributes::TEXCOORD1_U); -ASSERT_POS(tc0_w, RasterizerRegs::VSOutputAttributes::TEXCOORD0_W); -ASSERT_POS(view, RasterizerRegs::VSOutputAttributes::VIEW_X); 
-ASSERT_POS(tc2, RasterizerRegs::VSOutputAttributes::TEXCOORD2_U); -#undef ASSERT_POS -static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); -static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size"); - -/** - * This structure contains state information for primitive emitting in geometry shader. - */ -struct GSEmitter { - std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer; - u8 vertex_id; - bool prim_emit; - bool winding; - u32 output_mask; - - // Function objects are hidden behind a raw pointer to make the structure standard layout type, - // for JIT to use offsetof to access other members. - struct Handlers { - VertexHandler vertex_handler; - WindingSetter winding_setter; - } * handlers; - - GSEmitter(); - ~GSEmitter(); - void Emit(Math::Vec4<float24> (&vertex)[16]); -}; -static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type"); - -/** - * This structure contains the state information that needs to be unique for a shader unit. The 3DS - * has four shader units that process shaders in parallel. At the present, Citra only implements a - * single shader unit that processes all shaders serially. Putting the state information in a struct - * here will make it easier for us to parallelize the shader processing later. - */ -struct UnitState { - explicit UnitState(GSEmitter* emitter = nullptr); - struct Registers { - // The registers are accessed by the shader JIT using SSE instructions, and are therefore - // required to be 16-byte aligned. - alignas(16) Math::Vec4<float24> input[16]; - alignas(16) Math::Vec4<float24> temporary[16]; - alignas(16) Math::Vec4<float24> output[16]; - } registers; - static_assert(std::is_pod<Registers>::value, "Structure is not POD"); - - bool conditional_code[2]; - - // Two Address registers and one loop counter - // TODO: How many bits do these actually have? 
- s32 address_registers[3]; - - GSEmitter* emitter_ptr; - - static size_t InputOffset(const SourceRegister& reg) { - switch (reg.GetRegisterType()) { - case RegisterType::Input: - return offsetof(UnitState, registers.input) + - reg.GetIndex() * sizeof(Math::Vec4<float24>); - - case RegisterType::Temporary: - return offsetof(UnitState, registers.temporary) + - reg.GetIndex() * sizeof(Math::Vec4<float24>); - - default: - UNREACHABLE(); - return 0; - } - } - - static size_t OutputOffset(const DestRegister& reg) { - switch (reg.GetRegisterType()) { - case RegisterType::Output: - return offsetof(UnitState, registers.output) + - reg.GetIndex() * sizeof(Math::Vec4<float24>); - - case RegisterType::Temporary: - return offsetof(UnitState, registers.temporary) + - reg.GetIndex() * sizeof(Math::Vec4<float24>); - - default: - UNREACHABLE(); - return 0; - } - } - - /** - * Loads the unit state with an input vertex. - * - * @param config Shader configuration registers corresponding to the unit. - * @param input Attribute buffer to load into the input registers. - */ - void LoadInput(const ShaderRegs& config, const AttributeBuffer& input); - - void WriteOutput(const ShaderRegs& config, AttributeBuffer& output); -}; - -/** - * This is an extended shader unit state that represents the special unit that can run both vertex - * shader and geometry shader. It contains an additional primitive emitter and utilities for - * geometry shader. - */ -struct GSUnitState : public UnitState { - GSUnitState(); - void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter); - void ConfigOutput(const ShaderRegs& config); - - GSEmitter emitter; -}; - -struct ShaderSetup { - struct { - // The float uniforms are accessed by the shader JIT using SSE instructions, and are - // therefore required to be 16-byte aligned. 
- alignas(16) Math::Vec4<float24> f[96]; - - std::array<bool, 16> b; - std::array<Math::Vec4<u8>, 4> i; - } uniforms; - - static size_t GetFloatUniformOffset(unsigned index) { - return offsetof(ShaderSetup, uniforms.f) + index * sizeof(Math::Vec4<float24>); - } - - static size_t GetBoolUniformOffset(unsigned index) { - return offsetof(ShaderSetup, uniforms.b) + index * sizeof(bool); - } - - static size_t GetIntUniformOffset(unsigned index) { - return offsetof(ShaderSetup, uniforms.i) + index * sizeof(Math::Vec4<u8>); - } - - std::array<u32, MAX_PROGRAM_CODE_LENGTH> program_code; - std::array<u32, MAX_SWIZZLE_DATA_LENGTH> swizzle_data; - - /// Data private to ShaderEngines - struct EngineData { - unsigned int entry_point; - /// Used by the JIT, points to a compiled shader object. - const void* cached_shader = nullptr; - } engine_data; -}; - -class ShaderEngine { -public: - virtual ~ShaderEngine() = default; - - /** - * Performs any shader unit setup that only needs to happen once per shader (as opposed to once - * per vertex, which would happen within the `Run` function). - */ - virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0; - - /** - * Runs the currently setup shader. - * - * @param setup Shader engine state, must be setup with SetupBatch on each shader change. - * @param state Shader unit state, must be setup with input data before each shader invocation. 
- */ - virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0; -}; - -// TODO(yuriks): Remove and make it non-global state somewhere -ShaderEngine* GetEngine(); -void Shutdown(); - -} // namespace Shader - -} // namespace Pica diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp deleted file mode 100644 index 9d4da4904..000000000 --- a/src/video_core/shader/shader_interpreter.cpp +++ /dev/null @@ -1,701 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <array> -#include <cmath> -#include <numeric> -#include <boost/container/static_vector.hpp> -#include <boost/range/algorithm/fill.hpp> -#include <nihstro/shader_bytecode.h> -#include "common/assert.h" -#include "common/common_types.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "common/vector_math.h" -#include "video_core/pica_state.h" -#include "video_core/pica_types.h" -#include "video_core/shader/shader.h" -#include "video_core/shader/shader_interpreter.h" - -using nihstro::OpCode; -using nihstro::Instruction; -using nihstro::RegisterType; -using nihstro::SourceRegister; -using nihstro::SwizzlePattern; - -namespace Pica { - -namespace Shader { - -struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? 
- u32 loop_address; // The address where we'll return to after each loop iteration -}; - -template <bool Debug> -static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, - unsigned offset) { - // TODO: Is there a maximal size for this? - boost::container::static_vector<CallStackElement, 16> call_stack; - u32 program_counter = offset; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - - auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, - u8 repeat_count, u8 loop_increment) { - // -1 to make sure when incrementing the PC we end up at the correct offset - program_counter = offset - 1; - ASSERT(call_stack.size() < call_stack.capacity()); - call_stack.push_back( - {offset + num_instructions, return_offset, repeat_count, loop_increment, offset}); - }; - - auto evaluate_condition = [&state](Instruction::FlowControlType flow_control) { - using Op = Instruction::FlowControlType::Op; - - bool result_x = flow_control.refx.Value() == state.conditional_code[0]; - bool result_y = flow_control.refy.Value() == state.conditional_code[1]; - - switch (flow_control.op) { - case Op::Or: - return result_x || result_y; - case Op::And: - return result_x && result_y; - case Op::JustX: - return result_x; - case Op::JustY: - return result_y; - default: - UNREACHABLE(); - return false; - } - }; - - const auto& uniforms = setup.uniforms; - const auto& swizzle_data = setup.swizzle_data; - const auto& program_code = setup.program_code; - - // Placeholder for invalid inputs - static float24 dummy_vec4_float24[4]; - - unsigned iteration = 0; - bool exit_loop = false; - while (!exit_loop) { - if (!call_stack.empty()) { - auto& top = call_stack.back(); - if (program_counter == top.final_address) { - state.address_registers[2] += top.loop_increment; - - if (top.repeat_counter-- == 0) { - program_counter = top.return_address; - call_stack.pop_back(); - } else { - program_counter = 
top.loop_address; - } - - // TODO: Is "trying again" accurate to hardware? - continue; - } - } - - const Instruction instr = {program_code[program_counter]}; - const SwizzlePattern swizzle = {swizzle_data[instr.common.operand_desc_id]}; - - Record<DebugDataRecord::CUR_INSTR>(debug_data, iteration, program_counter); - if (iteration > 0) - Record<DebugDataRecord::NEXT_INSTR>(debug_data, iteration - 1, program_counter); - - debug_data.max_offset = std::max<u32>(debug_data.max_offset, 1 + program_counter); - - auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { - switch (source_reg.GetRegisterType()) { - case RegisterType::Input: - return &state.registers.input[source_reg.GetIndex()].x; - - case RegisterType::Temporary: - return &state.registers.temporary[source_reg.GetIndex()].x; - - case RegisterType::FloatUniform: - return &uniforms.f[source_reg.GetIndex()].x; - - default: - return dummy_vec4_float24; - } - }; - - switch (instr.opcode.Value().GetInfo().type) { - case OpCode::Type::Arithmetic: { - const bool is_inverted = - (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); - - const int address_offset = - (instr.common.address_register_index == 0) - ? 0 - : state.address_registers[instr.common.address_register_index - 1]; - - const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + - (is_inverted ? 0 : address_offset)); - const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + - (is_inverted ? 
address_offset : 0)); - - const bool negate_src1 = ((bool)swizzle.negate_src1 != false); - const bool negate_src2 = ((bool)swizzle.negate_src2 != false); - - float24 src1[4] = { - src1_[(int)swizzle.src1_selector_0.Value()], - src1_[(int)swizzle.src1_selector_1.Value()], - src1_[(int)swizzle.src1_selector_2.Value()], - src1_[(int)swizzle.src1_selector_3.Value()], - }; - if (negate_src1) { - src1[0] = -src1[0]; - src1[1] = -src1[1]; - src1[2] = -src1[2]; - src1[3] = -src1[3]; - } - float24 src2[4] = { - src2_[(int)swizzle.src2_selector_0.Value()], - src2_[(int)swizzle.src2_selector_1.Value()], - src2_[(int)swizzle.src2_selector_2.Value()], - src2_[(int)swizzle.src2_selector_3.Value()], - }; - if (negate_src2) { - src2[0] = -src2[0]; - src2[1] = -src2[1]; - src2[2] = -src2[2]; - src2[3] = -src2[3]; - } - - float24* dest = - (instr.common.dest.Value() < 0x10) - ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] - : (instr.common.dest.Value() < 0x20) - ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] - : dummy_vec4_float24; - - debug_data.max_opdesc_id = - std::max<u32>(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id); - - switch (instr.opcode.Value().EffectiveOpCode()) { - case OpCode::Id::ADD: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = src1[i] + src2[i]; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - case OpCode::Id::MUL: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = src1[i] * src2[i]; - } - 
Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - case OpCode::Id::FLR: - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - - case OpCode::Id::MAX: - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - // NOTE: Exact form required to match NaN semantics to hardware: - // max(0, NaN) -> NaN - // max(NaN, 0) -> 0 - dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i]; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - - case OpCode::Id::MIN: - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - // NOTE: Exact form required to match NaN semantics to hardware: - // min(0, NaN) -> NaN - // min(NaN, 0) -> 0 - dest[i] = (src1[i] < src2[i]) ? 
src1[i] : src2[i]; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - - case OpCode::Id::DP3: - case OpCode::Id::DP4: - case OpCode::Id::DPH: - case OpCode::Id::DPHI: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - - OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode(); - if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI) - src1[3] = float24::FromFloat32(1.0f); - - int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4; - float24 dot = std::inner_product(src1, src1 + num_components, src2, - float24::FromFloat32(0.f)); - - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = dot; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - // Reciprocal - case OpCode::Id::RCP: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = rcp_res; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - // Reciprocal Square Root - case OpCode::Id::RSQ: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = rsq_res; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - case OpCode::Id::MOVA: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - for (int i = 0; i < 2; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - // TODO: Figure out how the 
rounding is done on hardware - state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32()); - } - Record<DebugDataRecord::ADDR_REG_OUT>(debug_data, iteration, - state.address_registers); - break; - } - - case OpCode::Id::MOV: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = src1[i]; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - case OpCode::Id::SGE: - case OpCode::Id::SGEI: - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) - : float24::FromFloat32(0.0f); - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - - case OpCode::Id::SLT: - case OpCode::Id::SLTI: - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) - : float24::FromFloat32(0.0f); - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - - case OpCode::Id::CMP: - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - for (int i = 0; i < 2; ++i) { - // TODO: Can you restrict to one compare via dest masking? - - auto compare_op = instr.common.compare_op; - auto op = (i == 0) ? 
compare_op.x.Value() : compare_op.y.Value(); - - switch (op) { - case Instruction::Common::CompareOpType::Equal: - state.conditional_code[i] = (src1[i] == src2[i]); - break; - - case Instruction::Common::CompareOpType::NotEqual: - state.conditional_code[i] = (src1[i] != src2[i]); - break; - - case Instruction::Common::CompareOpType::LessThan: - state.conditional_code[i] = (src1[i] < src2[i]); - break; - - case Instruction::Common::CompareOpType::LessEqual: - state.conditional_code[i] = (src1[i] <= src2[i]); - break; - - case Instruction::Common::CompareOpType::GreaterThan: - state.conditional_code[i] = (src1[i] > src2[i]); - break; - - case Instruction::Common::CompareOpType::GreaterEqual: - state.conditional_code[i] = (src1[i] >= src2[i]); - break; - - default: - LOG_ERROR(HW_GPU, "Unknown compare mode %x", static_cast<int>(op)); - break; - } - } - Record<DebugDataRecord::CMP_RESULT>(debug_data, iteration, state.conditional_code); - break; - - case OpCode::Id::EX2: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - - // EX2 only takes first component exp2 and writes it to all dest components - float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = ex2_res; - } - - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - case OpCode::Id::LG2: { - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - - // LG2 only takes the first component log2 and writes it to all dest components - float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = lg2_res; - } - - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - break; - } - - default: - LOG_ERROR(HW_GPU, 
"Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x", - (int)instr.opcode.Value().EffectiveOpCode(), - instr.opcode.Value().GetInfo().name, instr.hex); - DEBUG_ASSERT(false); - break; - } - - break; - } - - case OpCode::Type::MultiplyAdd: { - if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) || - (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) { - const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>( - &swizzle_data[instr.mad.operand_desc_id]); - - bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI); - - const int address_offset = - (instr.mad.address_register_index == 0) - ? 0 - : state.address_registers[instr.mad.address_register_index - 1]; - - const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted)); - const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + - (!is_inverted * address_offset)); - const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + - (is_inverted * address_offset)); - - const bool negate_src1 = ((bool)swizzle.negate_src1 != false); - const bool negate_src2 = ((bool)swizzle.negate_src2 != false); - const bool negate_src3 = ((bool)swizzle.negate_src3 != false); - - float24 src1[4] = { - src1_[(int)swizzle.src1_selector_0.Value()], - src1_[(int)swizzle.src1_selector_1.Value()], - src1_[(int)swizzle.src1_selector_2.Value()], - src1_[(int)swizzle.src1_selector_3.Value()], - }; - if (negate_src1) { - src1[0] = -src1[0]; - src1[1] = -src1[1]; - src1[2] = -src1[2]; - src1[3] = -src1[3]; - } - float24 src2[4] = { - src2_[(int)swizzle.src2_selector_0.Value()], - src2_[(int)swizzle.src2_selector_1.Value()], - src2_[(int)swizzle.src2_selector_2.Value()], - src2_[(int)swizzle.src2_selector_3.Value()], - }; - if (negate_src2) { - src2[0] = -src2[0]; - src2[1] = -src2[1]; - src2[2] = -src2[2]; - src2[3] = -src2[3]; - } - float24 src3[4] = { - src3_[(int)swizzle.src3_selector_0.Value()], - 
src3_[(int)swizzle.src3_selector_1.Value()], - src3_[(int)swizzle.src3_selector_2.Value()], - src3_[(int)swizzle.src3_selector_3.Value()], - }; - if (negate_src3) { - src3[0] = -src3[0]; - src3[1] = -src3[1]; - src3[2] = -src3[2]; - src3[3] = -src3[3]; - } - - float24* dest = - (instr.mad.dest.Value() < 0x10) - ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] - : (instr.mad.dest.Value() < 0x20) - ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] - : dummy_vec4_float24; - - Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); - Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); - Record<DebugDataRecord::SRC3>(debug_data, iteration, src3); - Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); - for (int i = 0; i < 4; ++i) { - if (!swizzle.DestComponentEnabled(i)) - continue; - - dest[i] = src1[i] * src2[i] + src3[i]; - } - Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); - } else { - LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x", - (int)instr.opcode.Value().EffectiveOpCode(), - instr.opcode.Value().GetInfo().name, instr.hex); - } - break; - } - - default: { - // Handle each instruction on its own - switch (instr.opcode.Value()) { - case OpCode::Id::END: - exit_loop = true; - break; - - case OpCode::Id::JMPC: - Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code); - if (evaluate_condition(instr.flow_control)) { - program_counter = instr.flow_control.dest_offset - 1; - } - break; - - case OpCode::Id::JMPU: - Record<DebugDataRecord::COND_BOOL_IN>( - debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); - - if (uniforms.b[instr.flow_control.bool_uniform_id] == - !(instr.flow_control.num_instructions & 1)) { - program_counter = instr.flow_control.dest_offset - 1; - } - break; - - case OpCode::Id::CALL: - call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, - program_counter + 1, 0, 0); - break; - - 
case OpCode::Id::CALLU: - Record<DebugDataRecord::COND_BOOL_IN>( - debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); - if (uniforms.b[instr.flow_control.bool_uniform_id]) { - call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, - program_counter + 1, 0, 0); - } - break; - - case OpCode::Id::CALLC: - Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code); - if (evaluate_condition(instr.flow_control)) { - call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, - program_counter + 1, 0, 0); - } - break; - - case OpCode::Id::NOP: - break; - - case OpCode::Id::IFU: - Record<DebugDataRecord::COND_BOOL_IN>( - debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); - if (uniforms.b[instr.flow_control.bool_uniform_id]) { - call(program_counter + 1, instr.flow_control.dest_offset - program_counter - 1, - instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, - 0); - } else { - call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, - instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, - 0); - } - - break; - - case OpCode::Id::IFC: { - // TODO: Do we need to consider swizzlers here? 
- - Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code); - if (evaluate_condition(instr.flow_control)) { - call(program_counter + 1, instr.flow_control.dest_offset - program_counter - 1, - instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, - 0); - } else { - call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, - instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, - 0); - } - - break; - } - - case OpCode::Id::LOOP: { - Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x, - uniforms.i[instr.flow_control.int_uniform_id].y, - uniforms.i[instr.flow_control.int_uniform_id].z, - uniforms.i[instr.flow_control.int_uniform_id].w); - state.address_registers[2] = loop_param.y; - - Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param); - call(program_counter + 1, instr.flow_control.dest_offset - program_counter, - instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z); - break; - } - - case OpCode::Id::EMIT: { - GSEmitter* emitter = state.emitter_ptr; - ASSERT_MSG(emitter, "Execute EMIT on VS"); - emitter->Emit(state.registers.output); - break; - } - - case OpCode::Id::SETEMIT: { - GSEmitter* emitter = state.emitter_ptr; - ASSERT_MSG(emitter, "Execute SETEMIT on VS"); - emitter->vertex_id = instr.setemit.vertex_id; - emitter->prim_emit = instr.setemit.prim_emit != 0; - emitter->winding = instr.setemit.winding != 0; - break; - } - - default: - LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", - (int)instr.opcode.Value().EffectiveOpCode(), - instr.opcode.Value().GetInfo().name, instr.hex); - break; - } - - break; - } - } - - ++program_counter; - ++iteration; - } -} - -void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { - ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH); - setup.engine_data.entry_point = entry_point; -} - -MICROPROFILE_DECLARE(GPU_Shader); - -void InterpreterEngine::Run(const 
ShaderSetup& setup, UnitState& state) const { - - MICROPROFILE_SCOPE(GPU_Shader); - - DebugData<false> dummy_debug_data; - RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point); -} - -DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, - const AttributeBuffer& input, - const ShaderRegs& config) const { - UnitState state; - DebugData<true> debug_data; - - // Setup input register table - boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); - state.LoadInput(config, input); - RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); - return debug_data; -} - -} // namespace - -} // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h deleted file mode 100644 index 50fd7c69d..000000000 --- a/src/video_core/shader/shader_interpreter.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include "video_core/shader/debug_data.h" -#include "video_core/shader/shader.h" - -namespace Pica { - -namespace Shader { - -class InterpreterEngine final : public ShaderEngine { -public: - void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; - void Run(const ShaderSetup& setup, UnitState& state) const override; - - /** - * Produce debug information based on the given shader and input vertex - * @param setup Shader engine state - * @param input Input vertex into the shader - * @param config Configuration object for the shader pipeline - * @return Debug information for this shader with regards to the given vertex - */ - DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input, - const ShaderRegs& config) const; -}; - -} // namespace - -} // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp deleted file mode 100644 index 73c21871c..000000000 --- a/src/video_core/shader/shader_jit_x64.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2016 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include "common/hash.h" -#include "common/microprofile.h" -#include "video_core/shader/shader.h" -#include "video_core/shader/shader_jit_x64.h" -#include "video_core/shader/shader_jit_x64_compiler.h" - -namespace Pica { -namespace Shader { - -JitX64Engine::JitX64Engine() = default; -JitX64Engine::~JitX64Engine() = default; - -void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { - ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH); - setup.engine_data.entry_point = entry_point; - - u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); - u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data)); - - u64 cache_key = code_hash ^ swizzle_hash; - auto iter = cache.find(cache_key); - if (iter != cache.end()) { - setup.engine_data.cached_shader = iter->second.get(); - } else { - auto shader = std::make_unique<JitShader>(); - shader->Compile(&setup.program_code, &setup.swizzle_data); - setup.engine_data.cached_shader = shader.get(); - cache.emplace_hint(iter, cache_key, std::move(shader)); - } -} - -MICROPROFILE_DECLARE(GPU_Shader); - -void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const { - ASSERT(setup.engine_data.cached_shader != nullptr); - - MICROPROFILE_SCOPE(GPU_Shader); - - const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader); - shader->Run(setup, state, setup.engine_data.entry_point); -} - -} // namespace Shader -} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h deleted file mode 100644 index 078b2cba5..000000000 --- a/src/video_core/shader/shader_jit_x64.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2016 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <memory> -#include <unordered_map> -#include "common/common_types.h" -#include "video_core/shader/shader.h" - -namespace Pica { -namespace Shader { - -class JitShader; - -class JitX64Engine final : public ShaderEngine { -public: - JitX64Engine(); - ~JitX64Engine() override; - - void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; - void Run(const ShaderSetup& setup, UnitState& state) const override; - -private: - std::unordered_map<u64, std::unique_ptr<JitShader>> cache; -}; - -} // namespace Shader -} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp deleted file mode 100644 index 1b31623bd..000000000 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ /dev/null @@ -1,942 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <cmath> -#include <cstdint> -#include <nihstro/shader_bytecode.h> -#include <smmintrin.h> -#include <xmmintrin.h> -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/vector_math.h" -#include "common/x64/cpu_detect.h" -#include "common/x64/xbyak_abi.h" -#include "common/x64/xbyak_util.h" -#include "video_core/pica_state.h" -#include "video_core/pica_types.h" -#include "video_core/shader/shader.h" -#include "video_core/shader/shader_jit_x64_compiler.h" - -using namespace Common::X64; -using namespace Xbyak::util; -using Xbyak::Label; -using Xbyak::Reg32; -using Xbyak::Reg64; -using Xbyak::Xmm; - -namespace Pica { - -namespace Shader { - -typedef void (JitShader::*JitFunction)(Instruction instr); - -const JitFunction instr_table[64] = { - &JitShader::Compile_ADD, // add - &JitShader::Compile_DP3, // dp3 - &JitShader::Compile_DP4, // dp4 - &JitShader::Compile_DPH, // dph - nullptr, // unknown - &JitShader::Compile_EX2, // ex2 - &JitShader::Compile_LG2, // lg2 - 
nullptr, // unknown - &JitShader::Compile_MUL, // mul - &JitShader::Compile_SGE, // sge - &JitShader::Compile_SLT, // slt - &JitShader::Compile_FLR, // flr - &JitShader::Compile_MAX, // max - &JitShader::Compile_MIN, // min - &JitShader::Compile_RCP, // rcp - &JitShader::Compile_RSQ, // rsq - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_MOVA, // mova - &JitShader::Compile_MOV, // mov - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_DPH, // dphi - nullptr, // unknown - &JitShader::Compile_SGE, // sgei - &JitShader::Compile_SLT, // slti - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_NOP, // nop - &JitShader::Compile_END, // end - nullptr, // break - &JitShader::Compile_CALL, // call - &JitShader::Compile_CALLC, // callc - &JitShader::Compile_CALLU, // callu - &JitShader::Compile_IF, // ifu - &JitShader::Compile_IF, // ifc - &JitShader::Compile_LOOP, // loop - &JitShader::Compile_EMIT, // emit - &JitShader::Compile_SETE, // sete - &JitShader::Compile_JMP, // jmpc - &JitShader::Compile_JMP, // jmpu - &JitShader::Compile_CMP, // cmp - &JitShader::Compile_CMP, // cmp - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad -}; - -// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can -// be used as scratch registers within a compiler function. 
The other registers have designated -// purposes, as documented below: - -/// Pointer to the uniform memory -static const Reg64 SETUP = r9; -/// The two 32-bit VS address offset registers set by the MOVA instruction -static const Reg64 ADDROFFS_REG_0 = r10; -static const Reg64 ADDROFFS_REG_1 = r11; -/// VS loop count register (Multiplied by 16) -static const Reg32 LOOPCOUNT_REG = r12d; -/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) -static const Reg32 LOOPCOUNT = esi; -/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) -static const Reg32 LOOPINC = edi; -/// Result of the previous CMP instruction for the X-component comparison -static const Reg64 COND0 = r13; -/// Result of the previous CMP instruction for the Y-component comparison -static const Reg64 COND1 = r14; -/// Pointer to the UnitState instance for the current VS unit -static const Reg64 STATE = r15; -/// SIMD scratch register -static const Xmm SCRATCH = xmm0; -/// Loaded with the first swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC1 = xmm1; -/// Loaded with the second swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC2 = xmm2; -/// Loaded with the third swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC3 = xmm3; -/// Additional scratch register -static const Xmm SCRATCH2 = xmm4; -/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one -static const Xmm ONE = xmm14; -/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR -static const Xmm NEGBIT = xmm15; - -// State registers that must not be modified by external functions calls -// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed -static const BitSet32 persistent_regs = BuildRegSet({ - // Pointers to register blocks - SETUP, STATE, - // Cached 
registers - ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, - // Constants - ONE, NEGBIT, - // Loop variables - LOOPCOUNT, LOOPINC, -}); - -/// Raw constant for the source register selector that indicates no swizzling is performed -static const u8 NO_SRC_REG_SWIZZLE = 0x1b; -/// Raw constant for the destination register enable mask that indicates all components are enabled -static const u8 NO_DEST_REG_MASK = 0xf; - -static void LogCritical(const char* msg) { - LOG_CRITICAL(HW_GPU, "%s", msg); -} - -void JitShader::Compile_Assert(bool condition, const char* msg) { - if (!condition) { - mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); - CallFarFunction(*this, LogCritical); - } -} - -/** - * Loads and swizzles a source register into the specified XMM register. - * @param instr VS instruction, used for determining how to load the source register - * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) - * @param src_reg SourceRegister object corresponding to the source register to load - * @param dest Destination XMM register to store the loaded, swizzled source register - */ -void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, - Xmm dest) { - Reg64 src_ptr; - size_t src_offset; - - if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { - src_ptr = SETUP; - src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); - } else { - src_ptr = STATE; - src_offset = UnitState::InputOffset(src_reg); - } - - int src_offset_disp = (int)src_offset; - ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); - - unsigned operand_desc_id; - - const bool is_inverted = - (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); - - unsigned address_register_index; - unsigned offset_src; - - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || - instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - 
operand_desc_id = instr.mad.operand_desc_id; - offset_src = is_inverted ? 3 : 2; - address_register_index = instr.mad.address_register_index; - } else { - operand_desc_id = instr.common.operand_desc_id; - offset_src = is_inverted ? 2 : 1; - address_register_index = instr.common.address_register_index; - } - - if (src_num == offset_src && address_register_index != 0) { - switch (address_register_index) { - case 1: // address offset 1 - movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); - break; - case 2: // address offset 2 - movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); - break; - case 3: // address offset 3 - movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); - break; - default: - UNREACHABLE(); - break; - } - } else { - // Load the source - movaps(dest, xword[src_ptr + src_offset_disp]); - } - - SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; - - // Generate instructions for source register swizzling as needed - u8 sel = swiz.GetRawSelector(src_num); - if (sel != NO_SRC_REG_SWIZZLE) { - // Selector component order needs to be reversed for the SHUFPS instruction - sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); - - // Shuffle inputs for swizzle - shufps(dest, dest, sel); - } - - // If the source register should be negated, flip the negative bit using XOR - const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; - if (negate[src_num - 1]) { - xorps(dest, NEGBIT); - } -} - -void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { - DestRegister dest; - unsigned operand_desc_id; - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || - instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - operand_desc_id = instr.mad.operand_desc_id; - dest = instr.mad.dest.Value(); - } else { - operand_desc_id = instr.common.operand_desc_id; - dest = instr.common.dest.Value(); - } - - SwizzlePattern swiz = 
{(*swizzle_data)[operand_desc_id]}; - - size_t dest_offset_disp = UnitState::OutputOffset(dest); - - // If all components are enabled, write the result to the destination register - if (swiz.dest_mask == NO_DEST_REG_MASK) { - // Store dest back to memory - movaps(xword[STATE + dest_offset_disp], src); - - } else { - // Not all components are enabled, so mask the result when storing to the destination - // register... - movaps(SCRATCH, xword[STATE + dest_offset_disp]); - - if (Common::GetCPUCaps().sse4_1) { - u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | - ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); - blendps(SCRATCH, src, mask); - } else { - movaps(SCRATCH2, src); - unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination - unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination - - // Compute selector to selectively copy source components to destination for SHUFPS - // instruction - u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | - ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | - ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | - ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); - shufps(SCRATCH, SCRATCH2, sel); - } - - // Store dest back to memory - movaps(xword[STATE + dest_offset_disp], SCRATCH); - } -} - -void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { - // 0 * inf and inf * 0 in the PICA should return 0 instead of NaN. This can be implemented by - // checking for NaNs before and after the multiplication. If the multiplication result is NaN - // where neither source was, this NaN was generated by a 0 * inf multiplication, and so the - // result should be transformed to 0 to match PICA fp rules. 
- - // Set scratch to mask of (src1 != NaN and src2 != NaN) - movaps(scratch, src1); - cmpordps(scratch, src2); - - mulps(src1, src2); - - // Set src2 to mask of (result == NaN) - movaps(src2, src1); - cmpunordps(src2, src2); - - // Clear components where scratch != src2 (i.e. if result is NaN where neither source was NaN) - xorps(scratch, src2); - andps(src1, scratch); -} - -void JitShader::Compile_EvaluateCondition(Instruction instr) { - // Note: NXOR is used below to check for equality - switch (instr.flow_control.op) { - case Instruction::FlowControlType::Or: - mov(eax, COND0); - mov(ebx, COND1); - xor_(eax, (instr.flow_control.refx.Value() ^ 1)); - xor_(ebx, (instr.flow_control.refy.Value() ^ 1)); - or_(eax, ebx); - break; - - case Instruction::FlowControlType::And: - mov(eax, COND0); - mov(ebx, COND1); - xor_(eax, (instr.flow_control.refx.Value() ^ 1)); - xor_(ebx, (instr.flow_control.refy.Value() ^ 1)); - and_(eax, ebx); - break; - - case Instruction::FlowControlType::JustX: - mov(eax, COND0); - xor_(eax, (instr.flow_control.refx.Value() ^ 1)); - break; - - case Instruction::FlowControlType::JustY: - mov(eax, COND1); - xor_(eax, (instr.flow_control.refy.Value() ^ 1)); - break; - } -} - -void JitShader::Compile_UniformCondition(Instruction instr) { - size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); - cmp(byte[SETUP + offset], 0); -} - -BitSet32 JitShader::PersistentCallerSavedRegs() { - return persistent_regs & ABI_ALL_CALLER_SAVED; -} - -void JitShader::Compile_ADD(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - addps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DP3(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC2, SRC2, 
_MM_SHUFFLE(1, 1, 1, 1)); - - movaps(SRC3, SRC1); - shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); - - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); - addps(SRC1, SRC2); - addps(SRC1, SRC3); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DP4(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - addps(SRC1, SRC2); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - addps(SRC1, SRC2); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DPH(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - if (Common::GetCPUCaps().sse4_1) { - // Set 4th component to 1.0 - blendps(SRC1, ONE, 0b1000); - } else { - // Set 4th component to 1.0 - movaps(SCRATCH, SRC1); - unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ - unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 - } - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - addps(SRC1, SRC2); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - addps(SRC1, SRC2); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_EX2(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, exp2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 
0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_LG2(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, log2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MUL(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_SGE(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - cmpleps(SRC2, SRC1); - andps(SRC2, ONE); - - Compile_DestEnable(instr, SRC2); -} - -void JitShader::Compile_SLT(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - cmpltps(SRC1, SRC2); - andps(SRC1, ONE); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_FLR(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - if (Common::GetCPUCaps().sse4_1) { - roundps(SRC1, SRC1, _MM_FROUND_FLOOR); - } else { - cvttps2dq(SRC1, SRC1); - cvtdq2ps(SRC1, SRC1); - } - - Compile_DestEnable(instr, SRC1); -} - -void 
JitShader::Compile_MAX(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. - maxps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MIN(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. - minps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]}; - - if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { - return; // NoOp - } - - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // Convert floats to integers using truncation (only care about X and Y components) - cvttps2dq(SRC1, SRC1); - - // Get result - movq(rax, SRC1); - - // Handle destination enable - if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { - // Move and sign-extend low 32 bits - movsxd(ADDROFFS_REG_0, eax); - - // Move and sign-extend high 32 bits - shr(rax, 32); - movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - shl(ADDROFFS_REG_1, 4); - } else { - if (swiz.DestComponentEnabled(0)) { - // Move and sign-extend low 32 bits - movsxd(ADDROFFS_REG_0, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - } else if (swiz.DestComponentEnabled(1)) { - // Move and sign-extend high 32 bits - shr(rax, 32); - movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_1, 4); - } - } -} - -void JitShader::Compile_MOV(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_RCP(Instruction instr) { - 
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica - // performs this operation more accurately. This should be checked on hardware. - rcpss(SRC1, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_RSQ(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica - // performs this operation more accurately. This should be checked on hardware. - rsqrtss(SRC1, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_NOP(Instruction instr) {} - -void JitShader::Compile_END(Instruction instr) { - ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16); - ret(); -} - -void JitShader::Compile_CALL(Instruction instr) { - // Push offset of the return - push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); - - // Call the subroutine - call(instruction_labels[instr.flow_control.dest_offset]); - - // Skip over the return offset that's on the stack - add(rsp, 8); -} - -void JitShader::Compile_CALLC(Instruction instr) { - Compile_EvaluateCondition(instr); - Label b; - jz(b); - Compile_CALL(instr); - L(b); -} - -void JitShader::Compile_CALLU(Instruction instr) { - Compile_UniformCondition(instr); - Label b; - jz(b); - Compile_CALL(instr); - L(b); -} - -void JitShader::Compile_CMP(Instruction instr) { - using Op = Instruction::Common::CompareOpType::Op; - Op op_x = instr.common.compare_op.x; - Op op_y = instr.common.compare_op.y; - - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. 
You need to - // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here - // because they don't match when used with NaNs. - static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; - - bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); - Xmm lhs_x = invert_op_x ? SRC2 : SRC1; - Xmm rhs_x = invert_op_x ? SRC1 : SRC2; - - if (op_x == op_y) { - // Compare X-component and Y-component together - cmpps(lhs_x, rhs_x, cmp[op_x]); - movq(COND0, lhs_x); - - mov(COND1, COND0); - } else { - bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); - Xmm lhs_y = invert_op_y ? SRC2 : SRC1; - Xmm rhs_y = invert_op_y ? SRC1 : SRC2; - - // Compare X-component - movaps(SCRATCH, lhs_x); - cmpss(SCRATCH, rhs_x, cmp[op_x]); - - // Compare Y-component - cmpps(lhs_y, rhs_y, cmp[op_y]); - - movq(COND0, SCRATCH); - movq(COND1, lhs_y); - } - - shr(COND0.cvt32(), 31); // ignores upper 32 bits in source - shr(COND1, 63); -} - -void JitShader::Compile_MAD(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); - - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); - Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); - } else { - Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); - Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); - } - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - addps(SRC1, SRC3); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_IF(Instruction instr) { - Compile_Assert(instr.flow_control.dest_offset >= program_counter, - "Backwards if-statements not supported"); - Label l_else, l_endif; - - // Evaluate the "IF" condition - if (instr.opcode.Value() == OpCode::Id::IFU) { - Compile_UniformCondition(instr); - } else if (instr.opcode.Value() == OpCode::Id::IFC) { - Compile_EvaluateCondition(instr); - } - jz(l_else, T_NEAR); - - // Compile the code that corresponds to the 
condition evaluating as true - Compile_Block(instr.flow_control.dest_offset); - - // If there isn't an "ELSE" condition, we are done here - if (instr.flow_control.num_instructions == 0) { - L(l_else); - return; - } - - jmp(l_endif, T_NEAR); - - L(l_else); - // This code corresponds to the "ELSE" condition - // Compile the code that corresponds to the condition evaluating as false - Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); - - L(l_endif); -} - -void JitShader::Compile_LOOP(Instruction instr) { - Compile_Assert(instr.flow_control.dest_offset >= program_counter, - "Backwards loops not supported"); - Compile_Assert(!looping, "Nested loops not supported"); - - looping = true; - - // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. - // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by - // 4 bits) to be used as an offset into the 16-byte vector registers later - size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); - mov(LOOPCOUNT, dword[SETUP + offset]); - mov(LOOPCOUNT_REG, LOOPCOUNT); - shr(LOOPCOUNT_REG, 4); - and_(LOOPCOUNT_REG, 0xFF0); // Y-component is the start - mov(LOOPINC, LOOPCOUNT); - shr(LOOPINC, 12); - and_(LOOPINC, 0xFF0); // Z-component is the incrementer - movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count - add(LOOPCOUNT, 1); // Iteration count is X-component + 1 - - Label l_loop_start; - L(l_loop_start); - - Compile_Block(instr.flow_control.dest_offset + 1); - - add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component - sub(LOOPCOUNT, 1); // Decrement loop count by 1 - jnz(l_loop_start); // Loop if not equal - - looping = false; -} - -void JitShader::Compile_JMP(Instruction instr) { - if (instr.opcode.Value() == OpCode::Id::JMPC) - Compile_EvaluateCondition(instr); - else if (instr.opcode.Value() == OpCode::Id::JMPU) - Compile_UniformCondition(instr); - else - 
UNREACHABLE(); - - bool inverted_condition = - (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); - - Label& b = instruction_labels[instr.flow_control.dest_offset]; - if (inverted_condition) { - jz(b, T_NEAR); - } else { - jnz(b, T_NEAR); - } -} - -static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) { - emitter->Emit(*output); -} - -void JitShader::Compile_EMIT(Instruction instr) { - Label have_emitter, end; - mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]); - test(rax, rax); - jnz(have_emitter); - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS")); - CallFarFunction(*this, LogCritical); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - jmp(end); - - L(have_emitter); - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(ABI_PARAM1, rax); - mov(ABI_PARAM2, STATE); - add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output))); - CallFarFunction(*this, Emit); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - L(end); -} - -void JitShader::Compile_SETE(Instruction instr) { - Label have_emitter, end; - mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]); - test(rax, rax); - jnz(have_emitter); - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS")); - CallFarFunction(*this, LogCritical); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - jmp(end); - - L(have_emitter); - mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id); - mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit); - mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding); - L(end); -} - -void JitShader::Compile_Block(unsigned end) { - while (program_counter < end) { - Compile_NextInstr(); 
- } -} - -void JitShader::Compile_Return() { - // Peek return offset on the stack and check if we're at that offset - mov(rax, qword[rsp + 8]); - cmp(eax, (program_counter)); - - // If so, jump back to before CALL - Label b; - jnz(b); - ret(); - L(b); -} - -void JitShader::Compile_NextInstr() { - if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { - Compile_Return(); - } - - L(instruction_labels[program_counter]); - - Instruction instr = {(*program_code)[program_counter++]}; - - OpCode::Id opcode = instr.opcode.Value(); - auto instr_func = instr_table[static_cast<unsigned>(opcode)]; - - if (instr_func) { - // JIT the instruction! - ((*this).*instr_func)(instr); - } else { - // Unhandled instruction - LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", - instr.opcode.Value().EffectiveOpCode(), instr.hex); - } -} - -void JitShader::FindReturnOffsets() { - return_offsets.clear(); - - for (size_t offset = 0; offset < program_code->size(); ++offset) { - Instruction instr = {(*program_code)[offset]}; - - switch (instr.opcode.Value()) { - case OpCode::Id::CALL: - case OpCode::Id::CALLC: - case OpCode::Id::CALLU: - return_offsets.push_back(instr.flow_control.dest_offset + - instr.flow_control.num_instructions); - break; - default: - break; - } - } - - // Sort for efficient binary search later - std::sort(return_offsets.begin(), return_offsets.end()); -} - -void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code_, - const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data_) { - program_code = program_code_; - swizzle_data = swizzle_data_; - - // Reset flow control state - program = (CompiledShader*)getCurr(); - program_counter = 0; - looping = false; - instruction_labels.fill(Xbyak::Label()); - - // Find all `CALL` instructions and identify return locations - FindReturnOffsets(); - - // The stack pointer is 8 modulo 16 at the entry of a procedure - // We reserve 16 bytes and assign a dummy 
value to the first 8 bytes, to catch any potential - // return checks (see Compile_Return) that happen in shader main routine. - ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16); - mov(qword[rsp + 8], 0xFFFFFFFFFFFFFFFFULL); - - mov(SETUP, ABI_PARAM1); - mov(STATE, ABI_PARAM2); - - // Zero address/loop registers - xor_(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); - xor_(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); - xor_(LOOPCOUNT_REG, LOOPCOUNT_REG); - - // Used to set a register to one - static const __m128 one = {1.f, 1.f, 1.f, 1.f}; - mov(rax, reinterpret_cast<size_t>(&one)); - movaps(ONE, xword[rax]); - - // Used to negate registers - static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; - mov(rax, reinterpret_cast<size_t>(&neg)); - movaps(NEGBIT, xword[rax]); - - // Jump to start of the shader program - jmp(ABI_PARAM3); - - // Compile entire program - Compile_Block(static_cast<unsigned>(program_code->size())); - - // Free memory that's no longer needed - program_code = nullptr; - swizzle_data = nullptr; - return_offsets.clear(); - return_offsets.shrink_to_fit(); - - ready(); - - ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); - LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); -} - -JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} - -} // namespace Shader - -} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h deleted file mode 100644 index 4aee56b1d..000000000 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <array> -#include <cstddef> -#include <utility> -#include <vector> -#include <nihstro/shader_bytecode.h> -#include <xbyak.h> -#include "common/bit_set.h" -#include "common/common_types.h" -#include "video_core/shader/shader.h" - -using nihstro::Instruction; -using nihstro::OpCode; -using nihstro::SwizzlePattern; - -namespace Pica { - -namespace Shader { - -/// Memory allocated for each compiled shader -constexpr size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 64; - -/** - * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 - * code that can be executed on the host machine directly. - */ -class JitShader : public Xbyak::CodeGenerator { -public: - JitShader(); - - void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { - program(&setup, &state, instruction_labels[offset].getAddress()); - } - - void Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code, - const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data); - - void Compile_ADD(Instruction instr); - void Compile_DP3(Instruction instr); - void Compile_DP4(Instruction instr); - void Compile_DPH(Instruction instr); - void Compile_EX2(Instruction instr); - void Compile_LG2(Instruction instr); - void Compile_MUL(Instruction instr); - void Compile_SGE(Instruction instr); - void Compile_SLT(Instruction instr); - void Compile_FLR(Instruction instr); - void Compile_MAX(Instruction instr); - void Compile_MIN(Instruction instr); - void Compile_RCP(Instruction instr); - void Compile_RSQ(Instruction instr); - void Compile_MOVA(Instruction instr); - void Compile_MOV(Instruction instr); - void Compile_NOP(Instruction instr); - void Compile_END(Instruction instr); - void Compile_CALL(Instruction instr); - void Compile_CALLC(Instruction instr); - void Compile_CALLU(Instruction instr); - void Compile_IF(Instruction instr); - void Compile_LOOP(Instruction instr); - void Compile_JMP(Instruction instr); - void 
Compile_CMP(Instruction instr); - void Compile_MAD(Instruction instr); - void Compile_EMIT(Instruction instr); - void Compile_SETE(Instruction instr); - -private: - void Compile_Block(unsigned end); - void Compile_NextInstr(); - - void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, - Xbyak::Xmm dest); - void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); - - /** - * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying - * zero by inf. Clobbers `src2` and `scratch`. - */ - void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); - - void Compile_EvaluateCondition(Instruction instr); - void Compile_UniformCondition(Instruction instr); - - /** - * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction. - */ - void Compile_Return(); - - BitSet32 PersistentCallerSavedRegs(); - - /** - * Assertion evaluated at compile-time, but only triggered if executed at runtime. - * @param condition Condition to be evaluated. - * @param msg Message to be logged if the assertion fails. - */ - void Compile_Assert(bool condition, const char* msg); - - /** - * Analyzes the entire shader program for `CALL` instructions before emitting any code, - * identifying the locations where a return needs to be inserted. 
- */ - void FindReturnOffsets(); - - const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; - const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr; - - /// Mapping of Pica VS instructions to pointers in the emitted code - std::array<Xbyak::Label, MAX_PROGRAM_CODE_LENGTH> instruction_labels; - - /// Offsets in code where a return needs to be inserted - std::vector<unsigned> return_offsets; - - unsigned program_counter = 0; ///< Offset of the next instruction to decode - bool looping = false; ///< True if compiling a loop, used to check for nested loops - - using CompiledShader = void(const void* setup, void* state, const u8* start_addr); - CompiledShader* program = nullptr; -}; - -} // Shader - -} // Pica |