path: root/src/video_core
Diffstat (limited to 'src/video_core')
-rw-r--r--  src/video_core/command_processor.cpp         | 23
-rw-r--r--  src/video_core/debug_utils/debug_utils.cpp    | 24
-rw-r--r--  src/video_core/pica.h                         | 12
-rw-r--r--  src/video_core/shader/shader.cpp              | 16
-rw-r--r--  src/video_core/shader/shader_interpreter.cpp  | 17
-rw-r--r--  src/video_core/shader/shader_jit_x64.cpp      | 68
-rw-r--r--  src/video_core/shader/shader_jit_x64.h        |  5
7 files changed, 94 insertions, 71 deletions
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 54721561e..4b59984ad 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -200,7 +200,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
for (int loader = 0; loader < 12; ++loader) {
const auto& loader_config = attribute_config.attribute_loaders[loader];
- u32 load_address = base_address + loader_config.data_offset;
+ u32 offset = 0;
// TODO: What happens if a loader overwrites a previous one's data?
for (unsigned component = 0; component < loader_config.component_count; ++component) {
@@ -212,17 +212,17 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
u32 attribute_index = loader_config.GetComponent(component);
if (attribute_index < 12) {
int element_size = attribute_config.GetElementSizeInBytes(attribute_index);
- load_address = Common::AlignUp(load_address, element_size);
- vertex_attribute_sources[attribute_index] = load_address;
+ offset = Common::AlignUp(offset, element_size);
+ vertex_attribute_sources[attribute_index] = base_address + loader_config.data_offset + offset;
vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
vertex_attribute_element_size[attribute_index] = element_size;
- load_address += attribute_config.GetStride(attribute_index);
+ offset += attribute_config.GetStride(attribute_index);
} else if (attribute_index < 16) {
// Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
- load_address = Common::AlignUp(load_address, 4);
- load_address += (attribute_index - 11) * 4;
+ offset = Common::AlignUp(offset, 4);
+ offset += (attribute_index - 11) * 4;
} else {
UNREACHABLE(); // This is truly unreachable due to the number of bits for each component
}
@@ -234,7 +234,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
const auto& index_info = regs.index_array;
const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
- const u16* index_address_16 = (u16*)index_address_8;
+ const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
bool index_u16 = index_info.format != 0;
#if PICA_DUMP_GEOMETRY
@@ -345,10 +345,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
: (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
}
- const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
- (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
- (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
- *(float*)srcdata;
+ const float srcval =
+ (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *reinterpret_cast<const s8*>(srcdata) :
+ (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *reinterpret_cast<const u8*>(srcdata) :
+ (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *reinterpret_cast<const s16*>(srcdata) :
+ *reinterpret_cast<const float*>(srcdata);
input.attr[i][comp] = float24::FromFloat32(srcval);
LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f",
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 271e81ca1..bac6d69c7 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -117,13 +117,13 @@ void GeometryDumper::Dump() {
void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, const Shader::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes)
{
struct StuffToWrite {
- u8* pointer;
+ const u8* pointer;
u32 size;
};
std::vector<StuffToWrite> writing_queue;
u32 write_offset = 0;
- auto QueueForWriting = [&writing_queue,&write_offset](u8* pointer, u32 size) {
+ auto QueueForWriting = [&writing_queue,&write_offset](const u8* pointer, u32 size) {
writing_queue.push_back({pointer, size});
u32 old_write_offset = write_offset;
write_offset += size;
@@ -228,27 +228,27 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
DVLPHeader dvlp{ DVLPHeader::MAGIC_WORD };
DVLEHeader dvle{ DVLEHeader::MAGIC_WORD };
- QueueForWriting((u8*)&dvlb, sizeof(dvlb));
- u32 dvlp_offset = QueueForWriting((u8*)&dvlp, sizeof(dvlp));
- dvlb.dvle_offset = QueueForWriting((u8*)&dvle, sizeof(dvle));
+ QueueForWriting(reinterpret_cast<const u8*>(&dvlb), sizeof(dvlb));
+ u32 dvlp_offset = QueueForWriting(reinterpret_cast<const u8*>(&dvlp), sizeof(dvlp));
+ dvlb.dvle_offset = QueueForWriting(reinterpret_cast<const u8*>(&dvle), sizeof(dvle));
// TODO: Reduce the amount of binary code written to relevant portions
dvlp.binary_offset = write_offset - dvlp_offset;
dvlp.binary_size_words = setup.program_code.size();
- QueueForWriting((u8*)setup.program_code.data(), setup.program_code.size() * sizeof(u32));
+ QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()), setup.program_code.size() * sizeof(u32));
dvlp.swizzle_info_offset = write_offset - dvlp_offset;
dvlp.swizzle_info_num_entries = setup.swizzle_data.size();
u32 dummy = 0;
for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) {
- QueueForWriting((u8*)&setup.swizzle_data[i], sizeof(setup.swizzle_data[i]));
- QueueForWriting((u8*)&dummy, sizeof(dummy));
+ QueueForWriting(reinterpret_cast<const u8*>(&setup.swizzle_data[i]), sizeof(setup.swizzle_data[i]));
+ QueueForWriting(reinterpret_cast<const u8*>(&dummy), sizeof(dummy));
}
dvle.main_offset_words = config.main_offset;
dvle.output_register_table_offset = write_offset - dvlb.dvle_offset;
dvle.output_register_table_size = static_cast<u32>(output_info_table.size());
- QueueForWriting((u8*)output_info_table.data(), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo)));
+ QueueForWriting(reinterpret_cast<const u8*>(output_info_table.data()), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo)));
// TODO: Create a label table for "main"
@@ -292,14 +292,14 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
dvle.constant_table_offset = write_offset - dvlb.dvle_offset;
dvle.constant_table_size = constant_table.size();
for (const auto& constant : constant_table) {
- QueueForWriting((uint8_t*)&constant, sizeof(constant));
+ QueueForWriting(reinterpret_cast<const u8*>(&constant), sizeof(constant));
}
// Write data to file
std::ofstream file(filename, std::ios_base::out | std::ios_base::binary);
- for (auto& chunk : writing_queue) {
- file.write((char*)chunk.pointer, chunk.size);
+ for (const auto& chunk : writing_queue) {
+ file.write(reinterpret_cast<const char*>(chunk.pointer), chunk.size);
}
}
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 54187bcad..4b783ac6b 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -117,8 +117,8 @@ struct Regs {
INSERT_PADDING_WORDS(0x11);
union {
- BitField< 0, 16, u32> x;
- BitField<16, 16, u32> y;
+ BitField< 0, 10, s32> x;
+ BitField<16, 10, s32> y;
} viewport_corner;
INSERT_PADDING_WORDS(0x17);
@@ -1223,17 +1223,17 @@ struct Regs {
// Used for debugging purposes, so performance is not an issue here
static std::string GetCommandName(int index);
- static inline size_t NumIds() {
+ static constexpr size_t NumIds() {
return sizeof(Regs) / sizeof(u32);
}
- u32& operator [] (int index) const {
- u32* content = (u32*)this;
+ const u32& operator [] (int index) const {
+ const u32* content = reinterpret_cast<const u32*>(this);
return content[index];
}
u32& operator [] (int index) {
- u32* content = (u32*)this;
+ u32* content = reinterpret_cast<u32*>(this);
return content[index];
}
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index dbb8fd804..eb1db0778 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -32,6 +32,12 @@ namespace Shader {
static std::unordered_map<u64, CompiledShader*> shader_map;
static JitCompiler jit;
static CompiledShader* jit_shader;
+
+static void ClearCache() {
+ shader_map.clear();
+ jit.Clear();
+ LOG_INFO(HW_GPU, "Shader JIT cache cleared");
+}
#endif // ARCHITECTURE_x86_64
void Setup(UnitState<false>& state) {
@@ -45,6 +51,12 @@ void Setup(UnitState<false>& state) {
if (iter != shader_map.end()) {
jit_shader = iter->second;
} else {
+ // Check if remaining JIT code space is enough for at least one more (massive) shader
+ if (jit.GetSpaceLeft() < jit_shader_size) {
+ // If not, clear the cache of all previously compiled shaders
+ ClearCache();
+ }
+
jit_shader = jit.Compile();
shader_map.emplace(cache_key, jit_shader);
}
@@ -54,7 +66,7 @@ void Setup(UnitState<false>& state) {
void Shutdown() {
#ifdef ARCHITECTURE_x86_64
- shader_map.clear();
+ ClearCache();
#endif // ARCHITECTURE_x86_64
}
@@ -145,7 +157,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
}
- LOG_TRACE(Render_Software, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
+ LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
"col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)",
ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(),
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 79fcc56b9..9b978583e 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -2,10 +2,10 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <common/file_util.h>
-
+#include <numeric>
#include <nihstro/shader_bytecode.h>
+#include "common/file_util.h"
#include "video_core/pica.h"
#include "video_core/pica_state.h"
#include "video_core/shader/shader.h"
@@ -214,10 +214,8 @@ void RunInterpreter(UnitState<Debug>& state) {
if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI)
src1[3] = float24::FromFloat32(1.0f);
- float24 dot = float24::FromFloat32(0.f);
int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4;
- for (int i = 0; i < num_components; ++i)
- dot = dot + src1[i] * src2[i];
+ float24 dot = std::inner_product(src1, src1 + num_components, src2, float24::FromFloat32(0.f));
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
@@ -409,13 +407,16 @@ void RunInterpreter(UnitState<Debug>& state) {
{
if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) ||
(instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
- const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];
+ const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.mad.operand_desc_id]);
bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI);
+ const int address_offset = (instr.mad.address_register_index == 0)
+ ? 0 : state.address_registers[instr.mad.address_register_index - 1];
+
const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
- const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted));
- const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted));
+ const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + (!is_inverted * address_offset));
+ const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + ( is_inverted * address_offset));
const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 5083d7e54..dffe051ef 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -160,40 +160,41 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source
ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
unsigned operand_desc_id;
+
+ const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
+
+ unsigned address_register_index;
+ unsigned offset_src;
+
if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
- // The MAD and MADI instructions do not use the address offset registers, so loading the
- // source is a bit simpler here
-
operand_desc_id = instr.mad.operand_desc_id;
-
- // Load the source
- MOVAPS(dest, MDisp(src_ptr, src_offset_disp));
+ offset_src = is_inverted ? 3 : 2;
+ address_register_index = instr.mad.address_register_index;
} else {
operand_desc_id = instr.common.operand_desc_id;
+ offset_src = is_inverted ? 2 : 1;
+ address_register_index = instr.common.address_register_index;
+ }
- const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
- unsigned offset_src = is_inverted ? 2 : 1;
-
- if (src_num == offset_src && instr.common.address_register_index != 0) {
- switch (instr.common.address_register_index) {
- case 1: // address offset 1
- MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, SCALE_1, src_offset_disp));
- break;
- case 2: // address offset 2
- MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, SCALE_1, src_offset_disp));
- break;
- case 3: // address offset 3
- MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, SCALE_1, src_offset_disp));
- break;
- default:
- UNREACHABLE();
- break;
- }
- } else {
- // Load the source
- MOVAPS(dest, MDisp(src_ptr, src_offset_disp));
+ if (src_num == offset_src && address_register_index != 0) {
+ switch (address_register_index) {
+ case 1: // address offset 1
+ MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, SCALE_1, src_offset_disp));
+ break;
+ case 2: // address offset 2
+ MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, SCALE_1, src_offset_disp));
+ break;
+ case 3: // address offset 3
+ MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, SCALE_1, src_offset_disp));
+ break;
+ default:
+ UNREACHABLE();
+ break;
}
+ } else {
+ // Load the source
+ MOVAPS(dest, MDisp(src_ptr, src_offset_disp));
}
SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };
@@ -644,7 +645,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
}
void JitCompiler::Compile_IF(Instruction instr) {
- ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported");
+ ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements (%d -> %d) not supported",
+ *offset_ptr, instr.flow_control.dest_offset.Value());
// Evaluate the "IF" condition
if (instr.opcode.Value() == OpCode::Id::IFU) {
@@ -675,7 +677,8 @@ void JitCompiler::Compile_IF(Instruction instr) {
}
void JitCompiler::Compile_LOOP(Instruction instr) {
- ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported");
+ ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops (%d -> %d) not supported",
+ *offset_ptr, instr.flow_control.dest_offset.Value());
ASSERT_MSG(!looping, "Nested loops not supported");
looping = true;
@@ -703,7 +706,8 @@ void JitCompiler::Compile_LOOP(Instruction instr) {
}
void JitCompiler::Compile_JMP(Instruction instr) {
- ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported");
+ ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps (%d -> %d) not supported",
+ *offset_ptr, instr.flow_control.dest_offset.Value());
if (instr.opcode.Value() == OpCode::Id::JMPC)
Compile_EvaluateCondition(instr);
@@ -747,7 +751,7 @@ void JitCompiler::Compile_NextInstr(unsigned* offset) {
} else {
// Unhandled instruction
LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
- instr.opcode.Value().EffectiveOpCode(), instr.hex);
+ instr.opcode.Value().EffectiveOpCode(), instr.hex);
}
}
@@ -786,7 +790,7 @@ CompiledShader* JitCompiler::Compile() {
}
JitCompiler::JitCompiler() {
- AllocCodeSpace(1024 * 1024 * 4);
+ AllocCodeSpace(jit_cache_size);
}
void JitCompiler::Clear() {
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index 5ad2d9606..5357c964b 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -19,6 +19,11 @@ namespace Pica {
namespace Shader {
+/// Memory needed to be available to compile the next shader (otherwise, clear the cache)
+constexpr size_t jit_shader_size = 1024 * 512;
+/// Memory allocated for the JIT code space cache
+constexpr size_t jit_cache_size = 1024 * 1024 * 8;
+
using CompiledShader = void(void* registers);
/**