summaryrefslogtreecommitdiffstats
path: root/src/video_core/shader
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core/shader')
-rw-r--r--src/video_core/shader/shader.cpp55
-rw-r--r--src/video_core/shader/shader.h197
-rw-r--r--src/video_core/shader/shader_interpreter.cpp154
-rw-r--r--src/video_core/shader/shader_interpreter.h3
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp108
-rw-r--r--src/video_core/shader/shader_jit_x64.h8
6 files changed, 464 insertions, 61 deletions
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 6a27a8015..4e9836c80 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -5,6 +5,8 @@
#include <memory>
#include <unordered_map>
+#include <boost/range/algorithm/fill.hpp>
+
#include "common/hash.h"
#include "common/make_unique.h"
#include "common/profiler.h"
@@ -30,7 +32,7 @@ static JitCompiler jit;
static CompiledShader* jit_shader;
#endif // ARCHITECTURE_x86_64
-void Setup(UnitState& state) {
+void Setup(UnitState<false>& state) {
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled) {
u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
@@ -54,9 +56,8 @@ void Shutdown() {
static Common::Profiling::TimingCategory shader_category("Vertex Shader");
-OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) {
+OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
auto& config = g_state.regs.vs;
- auto& setup = g_state.vs;
Common::Profiling::ScopeTimer timer(shader_category);
@@ -67,6 +68,8 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
// Setup input register table
const auto& attribute_register_map = config.input_register_map;
+ // TODO: Instead of this cumbersome logic, just load the input data directly like
+ // for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; }
if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0];
if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1];
if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2];
@@ -96,12 +99,6 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
RunInterpreter(state);
#endif // ARCHITECTURE_x86_64
-#if PICA_DUMP_SHADERS
- DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(),
- state.debug.max_opdesc_id, config.main_offset,
- g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
-#endif
-
// Setup output data
OutputVertex ret;
// TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
@@ -132,14 +129,52 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
}
- LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+ LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), quat (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
+ ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(),
ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
return ret;
}
+DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) {
+ UnitState<true> state;
+
+ const auto& shader_memory = setup.program_code;
+ state.program_counter = config.main_offset;
+ state.debug.max_offset = 0;
+ state.debug.max_opdesc_id = 0;
+
+ // Setup input register table
+ const auto& attribute_register_map = config.input_register_map;
+ float24 dummy_register;
+ boost::fill(state.registers.input, &dummy_register);
+
+ if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x;
+ if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x;
+ if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x;
+ if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x;
+ if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x;
+ if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x;
+ if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x;
+ if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x;
+ if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x;
+ if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x;
+ if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x;
+ if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x;
+ if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x;
+ if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x;
+ if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x;
+ if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x;
+
+ state.conditional_code[0] = false;
+ state.conditional_code[1] = false;
+
+ RunInterpreter(state);
+ return state.debug;
+}
+
} // namespace Shader
} // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 2007a2844..bac51ddd8 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -4,7 +4,10 @@
#pragma once
+#include <vector>
+
#include <boost/container/static_vector.hpp>
+
#include <nihstro/shader_binary.h>
#include "common/common_funcs.h"
@@ -30,7 +33,7 @@ struct OutputVertex {
// VS output attributes
Math::Vec4<float24> pos;
- Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
+ Math::Vec4<float24> quat;
Math::Vec4<float24> color;
Math::Vec2<float24> tc0;
Math::Vec2<float24> tc1;
@@ -72,12 +75,185 @@ struct OutputVertex {
static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
+
+// Helper structure used to keep track of data useful for inspection of shader emulation
+template<bool full_debugging>
+struct DebugData;
+
+template<>
+struct DebugData<false> {
+ // TODO: Hide these behind and interface and move them to DebugData<true>
+ u32 max_offset; // maximum program counter ever reached
+ u32 max_opdesc_id; // maximum swizzle pattern index ever used
+};
+
+template<>
+struct DebugData<true> {
+ // Records store the input and output operands of a particular instruction.
+ struct Record {
+ enum Type {
+ // Floating point arithmetic operands
+ SRC1 = 0x1,
+ SRC2 = 0x2,
+ SRC3 = 0x4,
+
+ // Initial and final output operand value
+ DEST_IN = 0x8,
+ DEST_OUT = 0x10,
+
+ // Current and next instruction offset (in words)
+ CUR_INSTR = 0x20,
+ NEXT_INSTR = 0x40,
+
+ // Output address register value
+ ADDR_REG_OUT = 0x80,
+
+ // Result of a comparison instruction
+ CMP_RESULT = 0x100,
+
+ // Input values for conditional flow control instructions
+ COND_BOOL_IN = 0x200,
+ COND_CMP_IN = 0x400,
+
+ // Input values for a loop
+ LOOP_INT_IN = 0x800,
+ };
+
+ Math::Vec4<float24> src1;
+ Math::Vec4<float24> src2;
+ Math::Vec4<float24> src3;
+
+ Math::Vec4<float24> dest_in;
+ Math::Vec4<float24> dest_out;
+
+ s32 address_registers[2];
+ bool conditional_code[2];
+ bool cond_bool;
+ bool cond_cmp[2];
+ Math::Vec4<u8> loop_int;
+
+ u32 instruction_offset;
+ u32 next_instruction;
+
+ // set of enabled fields (as a combination of Type flags)
+ unsigned mask = 0;
+ };
+
+ u32 max_offset; // maximum program counter ever reached
+ u32 max_opdesc_id; // maximum swizzle pattern index ever used
+
+ // List of records for each executed shader instruction
+ std::vector<DebugData<true>::Record> records;
+};
+
+// Type alias for better readability
+using DebugDataRecord = DebugData<true>::Record;
+
+// Helper function to set a DebugData<true>::Record field based on the template enum parameter.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void SetField(DebugDataRecord& record, ValueType value);
+
+template<>
+inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) {
+ record.src1.x = value[0];
+ record.src1.y = value[1];
+ record.src1.z = value[2];
+ record.src1.w = value[3];
+}
+
+template<>
+inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) {
+ record.src2.x = value[0];
+ record.src2.y = value[1];
+ record.src2.z = value[2];
+ record.src2.w = value[3];
+}
+
+template<>
+inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) {
+ record.src3.x = value[0];
+ record.src3.y = value[1];
+ record.src3.z = value[2];
+ record.src3.w = value[3];
+}
+
+template<>
+inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
+ record.dest_in.x = value[0];
+ record.dest_in.y = value[1];
+ record.dest_in.z = value[2];
+ record.dest_in.w = value[3];
+}
+
+template<>
+inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) {
+ record.dest_out.x = value[0];
+ record.dest_out.y = value[1];
+ record.dest_out.z = value[2];
+ record.dest_out.w = value[3];
+}
+
+template<>
+inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) {
+ record.address_registers[0] = value[0];
+ record.address_registers[1] = value[1];
+}
+
+template<>
+inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) {
+ record.conditional_code[0] = value[0];
+ record.conditional_code[1] = value[1];
+}
+
+template<>
+inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) {
+ record.cond_bool = value;
+}
+
+template<>
+inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) {
+ record.cond_cmp[0] = value[0];
+ record.cond_cmp[1] = value[1];
+}
+
+template<>
+inline void SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) {
+ record.loop_int = value;
+}
+
+template<>
+inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) {
+ record.instruction_offset = value;
+}
+
+template<>
+inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) {
+ record.next_instruction = value;
+}
+
+// Helper function to set debug information on the current shader iteration.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) {
+ // Debugging disabled => nothing to do
+}
+
+template<DebugDataRecord::Type type, typename ValueType>
+inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) {
+ if (offset >= debug_data.records.size())
+ debug_data.records.resize(offset + 1);
+
+ SetField<type, ValueType>(debug_data.records[offset], value);
+ debug_data.records[offset].mask |= type;
+}
+
+
/**
* This structure contains the state information that needs to be unique for a shader unit. The 3DS
* has four shader units that process shaders in parallel. At the present, Citra only implements a
* single shader unit that processes all shaders serially. Putting the state information in a struct
* here will make it easier for us to parallelize the shader processing later.
*/
+template<bool Debug>
struct UnitState {
struct Registers {
// The registers are accessed by the shader JIT using SSE instructions, and are therefore
@@ -111,10 +287,7 @@ struct UnitState {
// TODO: Is there a maximal size for this?
boost::container::static_vector<CallStackElement, 16> call_stack;
- struct {
- u32 max_offset; // maximum program counter ever reached
- u32 max_opdesc_id; // maximum swizzle pattern index ever used
- } debug;
+ DebugData<Debug> debug;
static int InputOffset(const SourceRegister& reg) {
switch (reg.GetRegisterType()) {
@@ -150,7 +323,7 @@ struct UnitState {
* vertex, which would happen within the `Run` function).
* @param state Shader unit state, must be setup per shader and per shader unit
*/
-void Setup(UnitState& state);
+void Setup(UnitState<false>& state);
/// Performs any cleanup when the emulator is shutdown
void Shutdown();
@@ -162,7 +335,17 @@ void Shutdown();
* @param num_attributes The number of vertex shader attributes
* @return The output vertex, after having been processed by the vertex shader
*/
-OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes);
+OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+
+/**
+ * Produce debug information based on the given shader and input vertex
+ * @param input Input vertex into the shader
+ * @param num_attributes The number of vertex shader attributes
+ * @param config Configuration object for the shader pipeline
+ * @param setup Setup object for the shader pipeline
+ * @return Debug information for this shader with regards to the given vertex
+ */
+DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup);
} // namespace Shader
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index c8489f920..063cc38f0 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -21,7 +21,8 @@ namespace Pica {
namespace Shader {
-void RunInterpreter(UnitState& state) {
+template<bool Debug>
+void RunInterpreter(UnitState<Debug>& state) {
const auto& uniforms = g_state.vs.uniforms;
const auto& swizzle_data = g_state.vs.swizzle_data;
const auto& program_code = g_state.vs.program_code;
@@ -29,7 +30,9 @@ void RunInterpreter(UnitState& state) {
// Placeholder for invalid inputs
static float24 dummy_vec4_float24[4];
- while (true) {
+ unsigned iteration = 0;
+ bool exit_loop = false;
+ while (!exit_loop) {
if (!state.call_stack.empty()) {
auto& top = state.call_stack.back();
if (state.program_counter == top.final_address) {
@@ -47,16 +50,19 @@ void RunInterpreter(UnitState& state) {
}
}
- bool exit_loop = false;
const Instruction instr = { program_code[state.program_counter] };
const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
- static auto call = [](UnitState& state, u32 offset, u32 num_instructions,
+ static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
u32 return_offset, u8 repeat_count, u8 loop_increment) {
state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
ASSERT(state.call_stack.size() < state.call_stack.capacity());
state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
};
+ Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
+ if (iteration > 0)
+ Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
+
state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
@@ -123,58 +129,78 @@ void RunInterpreter(UnitState& state) {
switch (instr.opcode.Value().EffectiveOpCode()) {
case OpCode::Id::ADD:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i] + src2[i];
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
case OpCode::Id::MUL:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i] * src2[i];
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
case OpCode::Id::FLR:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::MAX:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = std::max(src1[i], src2[i]);
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::MIN:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = std::min(src1[i], src2[i]);
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::DP3:
case OpCode::Id::DP4:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
float24 dot = float24::FromFloat32(0.f);
int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4;
for (int i = 0; i < num_components; ++i)
@@ -186,12 +212,15 @@ void RunInterpreter(UnitState& state) {
dest[i] = dot;
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
// Reciprocal
case OpCode::Id::RCP:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@@ -200,13 +229,15 @@ void RunInterpreter(UnitState& state) {
// TODO: I think this might be wrong... we should only use one component here
dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32());
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
// Reciprocal Square Root
case OpCode::Id::RSQ:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@@ -215,12 +246,13 @@ void RunInterpreter(UnitState& state) {
// TODO: I think this might be wrong... we should only use one component here
dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32()));
}
-
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
case OpCode::Id::MOVA:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
for (int i = 0; i < 2; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@@ -228,32 +260,55 @@ void RunInterpreter(UnitState& state) {
// TODO: Figure out how the rounding is done on hardware
state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32());
}
-
+ Record<DebugDataRecord::ADDR_REG_OUT>(state.debug, iteration, state.address_registers);
break;
}
case OpCode::Id::MOV:
{
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i];
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
}
+ case OpCode::Id::SGE:
+ case OpCode::Id::SGEI:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+ }
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+
case OpCode::Id::SLT:
case OpCode::Id::SLTI:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::CMP:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
for (int i = 0; i < 2; ++i) {
// TODO: Can you restrict to one compare via dest masking?
@@ -261,27 +316,27 @@ void RunInterpreter(UnitState& state) {
auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value();
switch (op) {
- case compare_op.Equal:
+ case Instruction::Common::CompareOpType::Equal:
state.conditional_code[i] = (src1[i] == src2[i]);
break;
- case compare_op.NotEqual:
+ case Instruction::Common::CompareOpType::NotEqual:
state.conditional_code[i] = (src1[i] != src2[i]);
break;
- case compare_op.LessThan:
+ case Instruction::Common::CompareOpType::LessThan:
state.conditional_code[i] = (src1[i] < src2[i]);
break;
- case compare_op.LessEqual:
+ case Instruction::Common::CompareOpType::LessEqual:
state.conditional_code[i] = (src1[i] <= src2[i]);
break;
- case compare_op.GreaterThan:
+ case Instruction::Common::CompareOpType::GreaterThan:
state.conditional_code[i] = (src1[i] > src2[i]);
break;
- case compare_op.GreaterEqual:
+ case Instruction::Common::CompareOpType::GreaterEqual:
state.conditional_code[i] = (src1[i] >= src2[i]);
break;
@@ -290,7 +345,44 @@ void RunInterpreter(UnitState& state) {
break;
}
}
+ Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code);
+ break;
+
+ case OpCode::Id::EX2:
+ {
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ // EX2 only takes first component exp2 and writes it to all dest components
+ float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = ex2_res;
+ }
+
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
+ }
+
+ case OpCode::Id::LG2:
+ {
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ // LG2 only takes the first component log2 and writes it to all dest components
+ float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = lg2_res;
+ }
+
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+ }
default:
LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
@@ -359,12 +451,17 @@ void RunInterpreter(UnitState& state) {
: (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::SRC3>(state.debug, iteration, src3);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = src1[i] * src2[i] + src3[i];
}
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
} else {
LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x",
(int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
@@ -374,7 +471,7 @@ void RunInterpreter(UnitState& state) {
default:
{
- static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
+ static auto evaluate_condition = [](const UnitState<Debug>& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
bool results[2] = { refx == state.conditional_code[0],
refy == state.conditional_code[1] };
@@ -400,12 +497,14 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::JMPC:
+ Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
state.program_counter = instr.flow_control.dest_offset - 1;
}
break;
case OpCode::Id::JMPU:
+ Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
state.program_counter = instr.flow_control.dest_offset - 1;
}
@@ -419,6 +518,7 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::CALLU:
+ Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
call(state,
instr.flow_control.dest_offset,
@@ -428,6 +528,7 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::CALLC:
+ Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
call(state,
instr.flow_control.dest_offset,
@@ -440,6 +541,7 @@ void RunInterpreter(UnitState& state) {
break;
case OpCode::Id::IFU:
+ Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
call(state,
state.program_counter + 1,
@@ -458,6 +560,7 @@ void RunInterpreter(UnitState& state) {
{
// TODO: Do we need to consider swizzlers here?
+ Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
call(state,
state.program_counter + 1,
@@ -475,14 +578,19 @@ void RunInterpreter(UnitState& state) {
case OpCode::Id::LOOP:
{
- state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y;
+ Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x,
+ uniforms.i[instr.flow_control.int_uniform_id].y,
+ uniforms.i[instr.flow_control.int_uniform_id].z,
+ uniforms.i[instr.flow_control.int_uniform_id].w);
+ state.address_registers[2] = loop_param.y;
+ Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
call(state,
state.program_counter + 1,
instr.flow_control.dest_offset - state.program_counter + 1,
instr.flow_control.dest_offset + 1,
- uniforms.i[instr.flow_control.int_uniform_id].x,
- uniforms.i[instr.flow_control.int_uniform_id].z);
+ loop_param.x,
+ loop_param.z);
break;
}
@@ -497,12 +605,14 @@ void RunInterpreter(UnitState& state) {
}
++state.program_counter;
-
- if (exit_loop)
- break;
+ ++iteration;
}
}
+// Explicit instantiation
+template void RunInterpreter(UnitState<false>& state);
+template void RunInterpreter(UnitState<true>& state);
+
} // namespace
} // namespace
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index ad6e58e39..71bcad5ac 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -12,7 +12,8 @@ namespace Pica {
namespace Shader {
-void RunInterpreter(UnitState& state);
+template<bool Debug>
+void RunInterpreter(UnitState<Debug>& state);
} // namespace
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index ce47774d5..a1bdd8456 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -25,12 +25,12 @@ const JitFunction instr_table[64] = {
&JitCompiler::Compile_DP4, // dp4
nullptr, // dph
nullptr, // unknown
- nullptr, // ex2
- nullptr, // lg2
+ &JitCompiler::Compile_EX2, // ex2
+ &JitCompiler::Compile_LG2, // lg2
nullptr, // unknown
&JitCompiler::Compile_MUL, // mul
- nullptr, // lge
- nullptr, // slt
+ &JitCompiler::Compile_SGE, // sge
+ &JitCompiler::Compile_SLT, // slt
&JitCompiler::Compile_FLR, // flr
&JitCompiler::Compile_MAX, // max
&JitCompiler::Compile_MIN, // min
@@ -46,8 +46,8 @@ const JitFunction instr_table[64] = {
nullptr, // unknown
nullptr, // dphi
nullptr, // unknown
- nullptr, // sgei
- &JitCompiler::Compile_SLTI, // slti
+ &JitCompiler::Compile_SGE, // sgei
+ &JitCompiler::Compile_SLT, // slti
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
@@ -141,7 +141,7 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source
src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
} else {
src_ptr = REGISTERS;
- src_offset = UnitState::InputOffset(src_reg);
+ src_offset = UnitState<false>::InputOffset(src_reg);
}
unsigned operand_desc_id;
@@ -217,11 +217,11 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
// If all components are enabled, write the result to the destination register
if (swiz.dest_mask == NO_DEST_REG_MASK) {
// Store dest back to memory
- MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src);
+ MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), src);
} else {
// Not all components are enabled, so mask the result when storing to the destination register...
- MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest)));
+ MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)));
if (Common::GetCPUCaps().sse4_1) {
u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
@@ -240,7 +240,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
}
// Store dest back to memory
- MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH);
+ MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), SCRATCH);
}
}
@@ -280,6 +280,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) {
CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
}
+void JitCompiler::Compile_PushCallerSavedXMM() {
+#ifndef _WIN32
+ SUB(64, R(RSP), Imm8(2 * 16));
+ MOVUPS(MDisp(RSP, 16), ONE);
+ MOVUPS(MDisp(RSP, 0), NEGBIT);
+#endif
+}
+
+void JitCompiler::Compile_PopCallerSavedXMM() {
+#ifndef _WIN32
+ MOVUPS(NEGBIT, MDisp(RSP, 0));
+ MOVUPS(ONE, MDisp(RSP, 16));
+ ADD(64, R(RSP), Imm8(2 * 16));
+#endif
+}
+
void JitCompiler::Compile_ADD(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -331,6 +347,38 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
+void JitCompiler::Compile_EX2(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ MOVSS(XMM0, R(SRC1));
+
+ // The following will actually break the stack alignment
+ ABI_PushAllCallerSavedRegsAndAdjustStack();
+ Compile_PushCallerSavedXMM();
+ ABI_CallFunction(reinterpret_cast<const void*>(exp2f));
+ Compile_PopCallerSavedXMM();
+ ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+ SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
+ MOVAPS(SRC1, R(XMM0));
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_LG2(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ MOVSS(XMM0, R(SRC1));
+
+ // The following will actually break the stack alignment
+ ABI_PushAllCallerSavedRegsAndAdjustStack();
+ Compile_PushCallerSavedXMM();
+ ABI_CallFunction(reinterpret_cast<const void*>(log2f));
+ Compile_PopCallerSavedXMM();
+ ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+ SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
+ MOVAPS(SRC1, R(XMM0));
+ Compile_DestEnable(instr, SRC1);
+}
+
void JitCompiler::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -338,6 +386,36 @@ void JitCompiler::Compile_MUL(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
+void JitCompiler::Compile_SGE(Instruction instr) {
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ CMPPS(SRC1, R(SRC2), CMP_NLT);
+ ANDPS(SRC1, R(ONE));
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_SLT(Instruction instr) {
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ CMPPS(SRC1, R(SRC2), CMP_LT);
+ ANDPS(SRC1, R(ONE));
+
+ Compile_DestEnable(instr, SRC1);
+}
+
void JitCompiler::Compile_FLR(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
@@ -415,16 +493,6 @@ void JitCompiler::Compile_MOV(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_SLTI(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
- Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2);
-
- CMPSS(SRC1, R(SRC2), CMP_LT);
- ANDPS(SRC1, R(ONE));
-
- Compile_DestEnable(instr, SRC1);
-}
-
void JitCompiler::Compile_RCP(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index b88f2a0d2..b2aa5293c 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -37,7 +37,11 @@ public:
void Compile_ADD(Instruction instr);
void Compile_DP3(Instruction instr);
void Compile_DP4(Instruction instr);
+ void Compile_EX2(Instruction instr);
+ void Compile_LG2(Instruction instr);
void Compile_MUL(Instruction instr);
+ void Compile_SGE(Instruction instr);
+ void Compile_SLT(Instruction instr);
void Compile_FLR(Instruction instr);
void Compile_MAX(Instruction instr);
void Compile_MIN(Instruction instr);
@@ -45,7 +49,6 @@ public:
void Compile_RSQ(Instruction instr);
void Compile_MOVA(Instruction instr);
void Compile_MOV(Instruction instr);
- void Compile_SLTI(Instruction instr);
void Compile_NOP(Instruction instr);
void Compile_END(Instruction instr);
void Compile_CALL(Instruction instr);
@@ -67,6 +70,9 @@ private:
void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr);
+ void Compile_PushCallerSavedXMM();
+ void Compile_PopCallerSavedXMM();
+
/// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
unsigned* offset_ptr = nullptr;