From baa24f4ea9d9c4d7c1bd60ba8a6fc188dfa9cc8f Mon Sep 17 00:00:00 2001 From: wwylele Date: Thu, 3 Aug 2017 01:40:42 +0300 Subject: pica: upload shared shader code to both unit --- src/video_core/command_processor.cpp | 62 +++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'src/video_core/command_processor.cpp') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 4633a1df1..f98ca3302 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -119,27 +119,6 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup, } } -static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup, - unsigned max_program_code_length, u32 value) { - if (config.program.offset >= max_program_code_length) { - LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup), - (int)config.program.offset); - } else { - setup.program_code[config.program.offset] = value; - config.program.offset++; - } -} - -static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) { - if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { - LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup), - (int)config.swizzle_patterns.offset); - } else { - setup.swizzle_data[config.swizzle_patterns.offset] = value; - config.swizzle_patterns.offset++; - } -} - static void WritePicaReg(u32 id, u32 value, u32 mask) { auto& regs = g_state.regs; @@ -458,7 +437,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): { - WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value); + u32& offset = g_state.regs.gs.program.offset; + if (offset >= 4096) { + LOG_ERROR(HW_GPU, "Invalid GS program offset %u", offset); + } else { + g_state.gs.program_code[offset] = value; + offset++; + } break; } @@ -470,11 +455,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): { - WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value); + u32& offset = g_state.regs.gs.swizzle_patterns.offset; + if (offset >= g_state.gs.swizzle_data.size()) { + LOG_ERROR(HW_GPU, "Invalid GS swizzle pattern offset %u", offset); + } else { + g_state.gs.swizzle_data[offset] = value; + offset++; + } break; } case PICA_REG_INDEX(vs.bool_uniforms): + // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this? WriteUniformBoolReg(g_state.vs, g_state.regs.vs.bool_uniforms.Value()); break; @@ -482,6 +474,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { + // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this? unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); auto values = regs.vs.int_uniforms[index]; WriteUniformIntReg(g_state.vs, index, @@ -497,6 +490,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { + // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this? WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter, vs_uniform_write_buffer, value); break; @@ -510,7 +504,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { - WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value); + u32& offset = g_state.regs.vs.program.offset; + if (offset >= 512) { + LOG_ERROR(HW_GPU, "Invalid VS program offset %u", offset); + } else { + g_state.vs.program_code[offset] = value; + if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) { + g_state.gs.program_code[offset] = value; + } + offset++; + } break; } @@ -522,7 +525,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { - WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value); + u32& offset = g_state.regs.vs.swizzle_patterns.offset; + if (offset >= g_state.vs.swizzle_data.size()) { + LOG_ERROR(HW_GPU, "Invalid VS swizzle pattern offset %u", offset); + } else { + g_state.vs.swizzle_data[offset] = value; + if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) { + g_state.gs.swizzle_data[offset] = value; + } + offset++; + } break; } -- cgit v1.2.3 From 0f35755572fe63534813528de9a0710193f2e335 Mon Sep 17 00:00:00 2001 From: wwylele Date: Fri, 4 Aug 2017 17:03:17 +0300 Subject: pica/command_processor: build geometry pipeline and run geometry shader The geometry pipeline manages data transfer between VS, GS and primitive assembler. It has known four modes: - no GS mode: sends VS output directly to the primitive assembler (what citra currently does) - GS mode 0: sends VS output to GS input registers, and sends GS output to primitive assembler - GS mode 1: sends VS output to GS uniform registers, and sends GS output to primitive assembler. It also takes an index from the index buffer at the beginning of each primitive for determine the primitive size. - GS mode 2: similar to mode 1, but doesn't take the index and uses a fixed primitive size. hwtest shows that immediate mode also supports GS (at least for mode 0), so the geometry pipeline gets refactored into its own class for supporting both drawing mode. In the immediate mode, some games don't set the pipeline registers to a valid value until the first attribute input, so a geometry pipeline reset flag is set in `pipeline.vs_default_attributes_setup.index` trigger, and the actual pipeline reconfigure is triggered in the first attribute input. In the normal drawing mode with index buffer, the vertex cache is a little bit modified to support the geometry pipeline. Instead of OutputVertex, it now holds AttributeBuffer, which is the input to the geometry pipeline. The AttributeBuffer->OutputVertex conversion is done inside the pipeline vertex handler. The actual hardware vertex cache is believed to be implemented in a similar way (because this is the only way that makes sense). Both geometry pipeline and GS unit rely on states preservation across drawing call, so they are put into the global state. In the future, the other three vertex shader units should be also placed in the global state, and a scheduler should be implemented on top of the four units. Note that the current gs_unit already allows running VS on it in the future. --- src/video_core/command_processor.cpp | 54 ++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'src/video_core/command_processor.cpp') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index f98ca3302..fb65a3a0a 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -161,6 +161,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX(pipeline.vs_default_attributes_setup.index): g_state.immediate.current_attribute = 0; + g_state.immediate.reset_geometry_pipeline = true; default_attr_counter = 0; break; @@ -234,16 +235,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { shader_engine->Run(g_state.vs, shader_unit); shader_unit.WriteOutput(regs.vs, output); - // Send to renderer - using Pica::Shader::OutputVertex; - auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1, - const OutputVertex& v2) { - VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); - }; - - g_state.primitive_assembler.SubmitVertex( - Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output), - AddTriangle); + // Send to geometry pipeline + if (g_state.immediate.reset_geometry_pipeline) { + g_state.geometry_pipeline.Reconfigure(); + g_state.immediate.reset_geometry_pipeline = false; + } + ASSERT(!g_state.geometry_pipeline.NeedIndexInput()); + g_state.geometry_pipeline.Setup(shader_engine); + g_state.geometry_pipeline.SubmitVertex(output); } } } @@ -321,8 +320,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // The size has been tuned for optimal balance between hit-rate and the cost of lookup const size_t VERTEX_CACHE_SIZE = 32; std::array vertex_cache_ids; - std::array vertex_cache; - Shader::OutputVertex output_vertex; + std::array vertex_cache; + Shader::AttributeBuffer vs_output; unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); @@ -332,6 +331,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); + g_state.geometry_pipeline.Reconfigure(); + g_state.geometry_pipeline.Setup(shader_engine); + if (g_state.geometry_pipeline.NeedIndexInput()) + ASSERT(is_indexed); + for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) { // Indexed rendering doesn't use the start offset unsigned int vertex = @@ -345,6 +349,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { bool vertex_cache_hit = false; if (is_indexed) { + if (g_state.geometry_pipeline.NeedIndexInput()) { + g_state.geometry_pipeline.SubmitIndex(vertex); + continue; + } + if (g_debug_context && Pica::g_debug_context->recorder) { int size = index_u16 ? 2 : 1; memory_accesses.AddAccess(base_address + index_info.offset + size * index, @@ -353,7 +362,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { if (vertex == vertex_cache_ids[i]) { - output_vertex = vertex_cache[i]; + vs_output = vertex_cache[i]; vertex_cache_hit = true; break; } @@ -362,7 +371,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (!vertex_cache_hit) { // Initialize data for the current vertex - Shader::AttributeBuffer input, output{}; + Shader::AttributeBuffer input; loader.LoadVertex(base_address, index, vertex, input, memory_accesses); // Send to vertex shader @@ -371,26 +380,17 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { (void*)&input); shader_unit.LoadInput(regs.vs, input); shader_engine->Run(g_state.vs, shader_unit); - shader_unit.WriteOutput(regs.vs, output); - - // Retrieve vertex from register data - output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output); + shader_unit.WriteOutput(regs.vs, vs_output); if (is_indexed) { - vertex_cache[vertex_cache_pos] = output_vertex; + vertex_cache[vertex_cache_pos] = vs_output; vertex_cache_ids[vertex_cache_pos] = vertex; vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; } } - // Send to renderer - using Pica::Shader::OutputVertex; - auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1, - const OutputVertex& v2) { - VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); - }; - - primitive_assembler.SubmitVertex(output_vertex, AddTriangle); + // Send to geometry pipeline + g_state.geometry_pipeline.SubmitVertex(vs_output); } for (auto& range : memory_accesses.ranges) { -- cgit v1.2.3 From ad0b57f4071fb7ec9da764b3905e0bb5e4c5eef2 Mon Sep 17 00:00:00 2001 From: James Rowe Date: Thu, 7 Sep 2017 22:05:42 -0600 Subject: GPU: Add draw for immediate and batch modes PR #1461 introduced a regression where some games would change configuration even while in the poorly named "drawing" mode, which broke the heuristic citra was using to determine when to draw the batch. This change adds back in a draw call for batching, and also adds in a draw call in immediate mode each time it adds a triangle. --- src/video_core/command_processor.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'src/video_core/command_processor.cpp') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index fb65a3a0a..fff159058 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -243,6 +243,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { ASSERT(!g_state.geometry_pipeline.NeedIndexInput()); g_state.geometry_pipeline.Setup(shader_engine); g_state.geometry_pipeline.SubmitVertex(output); + + // TODO: If drawing after every immediate mode triangle kills performance, + // change it to flush triangles whenever a draing config register changes + // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550 + VideoCore::g_renderer->Rasterizer()->DrawTriangles(); + if (g_debug_context) { + g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, + nullptr); + } } } } @@ -398,6 +407,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { range.second, range.first); } + MICROPROFILE_SCOPE(GPU_Drawing); + VideoCore::g_renderer->Rasterizer()->DrawTriangles(); + if (g_debug_context) { + g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); + } + break; } @@ -632,6 +647,6 @@ void ProcessCommandList(const u32* list, u32 size) { } } -} // namespace +} // namespace CommandProcessor -} // namespace +} // namespace Pica -- cgit v1.2.3 From 19d41dcc6e6892125f1123b34db3dc284f04b744 Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 23 Sep 2017 09:28:20 -0600 Subject: Remove pipeline.gpu_mode and fix minor issues --- src/video_core/command_processor.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'src/video_core/command_processor.cpp') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index fff159058..3ab4af374 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -245,7 +245,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_state.geometry_pipeline.SubmitVertex(output); // TODO: If drawing after every immediate mode triangle kills performance, - // change it to flush triangles whenever a draing config register changes + // change it to flush triangles whenever a drawing config register changes // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550 VideoCore::g_renderer->Rasterizer()->DrawTriangles(); if (g_debug_context) { @@ -259,16 +259,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } case PICA_REG_INDEX(pipeline.gpu_mode): - if (regs.pipeline.gpu_mode == PipelineRegs::GPUMode::Configuring) { - MICROPROFILE_SCOPE(GPU_Drawing); - - // Draw immediate mode triangles when GPU Mode is set to GPUMode::Configuring - VideoCore::g_renderer->Rasterizer()->DrawTriangles(); - - if (g_debug_context) { - g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); - } - } + // This register likely just enables vertex processing and doesn't need any special handling break; case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c): @@ -407,7 +398,6 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { range.second, range.first); } - MICROPROFILE_SCOPE(GPU_Drawing); VideoCore::g_renderer->Rasterizer()->DrawTriangles(); if (g_debug_context) { g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); -- cgit v1.2.3 From b3b34a1e76664c412fd7b37b3529cadd3983acfb Mon Sep 17 00:00:00 2001 From: Huw Pascoe Date: Tue, 3 Oct 2017 12:21:37 +0100 Subject: Extracted the attribute setup and draw commands into their own functions --- src/video_core/command_processor.cpp | 439 ++++++++++++++++++----------------- 1 file changed, 222 insertions(+), 217 deletions(-) (limited to 'src/video_core/command_processor.cpp') diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 3ab4af374..caf9f7a06 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -119,6 +119,224 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup, } } +static void LoadDefaultVertexAttributes(u32 register_value) { + auto& regs = g_state.regs; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + default_attr_write_buffer[default_attr_counter++] = register_value; + + // Default attributes are written in a packed format such that four float24 values are encoded + // in three 32-bit numbers. + // We write to internal memory once a full such vector is written. + if (default_attr_counter >= 3) { + default_attr_counter = 0; + + auto& setup = regs.pipeline.vs_default_attributes_setup; + + if (setup.index >= 16) { + LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index); + return; + } + + Math::Vec4 attribute; + + // NOTE: The destination component order indeed is "backwards" + attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8); + attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) | + ((default_attr_write_buffer[1] >> 16) & 0xFFFF)); + attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) | + ((default_attr_write_buffer[2] >> 24) & 0xFF)); + attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF); + + LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index, + attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(), + attribute.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! + if (setup.index < 15) { + g_state.input_default_attributes.attr[setup.index] = attribute; + setup.index++; + } else { + // Put each attribute into an immediate input buffer. When all specified immediate + // attributes are present, the Vertex Shader is invoked and everything is sent to + // the primitive assembler. + + auto& immediate_input = g_state.immediate.input_vertex; + auto& immediate_attribute_id = g_state.immediate.current_attribute; + + immediate_input.attr[immediate_attribute_id] = attribute; + + if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) { + immediate_attribute_id += 1; + } else { + MICROPROFILE_SCOPE(GPU_Drawing); + immediate_attribute_id = 0; + + auto* shader_engine = Shader::GetEngine(); + shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); + + // Send to vertex shader + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, + static_cast(&immediate_input)); + Shader::UnitState shader_unit; + Shader::AttributeBuffer output{}; + + shader_unit.LoadInput(regs.vs, immediate_input); + shader_engine->Run(g_state.vs, shader_unit); + shader_unit.WriteOutput(regs.vs, output); + + // Send to geometry pipeline + if (g_state.immediate.reset_geometry_pipeline) { + g_state.geometry_pipeline.Reconfigure(); + g_state.immediate.reset_geometry_pipeline = false; + } + ASSERT(!g_state.geometry_pipeline.NeedIndexInput()); + g_state.geometry_pipeline.Setup(shader_engine); + g_state.geometry_pipeline.SubmitVertex(output); + + // TODO: If drawing after every immediate mode triangle kills performance, + // change it to flush triangles whenever a drawing config register changes + // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550 + VideoCore::g_renderer->Rasterizer()->DrawTriangles(); + if (g_debug_context) { + g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); + } + } + } + } +} + +static void Draw(u32 command_id) { + MICROPROFILE_SCOPE(GPU_Drawing); + auto& regs = g_state.regs; + +#if PICA_LOG_TEV + DebugUtils::DumpTevStageConfig(regs.GetTevStages()); +#endif + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); + + // Processes information about internal vertex attributes to figure out how a vertex is + // loaded. + // Later, these can be compiled and cached. + const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress(); + VertexLoader loader(regs.pipeline); + + // Load vertices + bool is_indexed = (command_id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); + + const auto& index_info = regs.pipeline.index_array; + const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset); + const u16* index_address_16 = reinterpret_cast(index_address_8); + bool index_u16 = index_info.format != 0; + + PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler; + + if (g_debug_context && g_debug_context->recorder) { + for (int i = 0; i < 3; ++i) { + const auto texture = regs.texturing.GetTextures()[i]; + if (!texture.enabled) + continue; + + u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress()); + g_debug_context->recorder->MemoryAccessed( + texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) * + texture.config.width / 2 * texture.config.height, + texture.config.GetPhysicalAddress()); + } + } + + DebugUtils::MemoryAccessTracker memory_accesses; + + // Simple circular-replacement vertex cache + // The size has been tuned for optimal balance between hit-rate and the cost of lookup + const size_t VERTEX_CACHE_SIZE = 32; + std::array vertex_cache_ids; + std::array vertex_cache; + Shader::AttributeBuffer vs_output; + + unsigned int vertex_cache_pos = 0; + vertex_cache_ids.fill(-1); + + auto* shader_engine = Shader::GetEngine(); + Shader::UnitState shader_unit; + + shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); + + g_state.geometry_pipeline.Reconfigure(); + g_state.geometry_pipeline.Setup(shader_engine); + if (g_state.geometry_pipeline.NeedIndexInput()) + ASSERT(is_indexed); + + for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) { + // Indexed rendering doesn't use the start offset + unsigned int vertex = is_indexed + ? (index_u16 ? index_address_16[index] : index_address_8[index]) + : (index + regs.pipeline.vertex_offset); + + // -1 is a common special value used for primitive restart. Since it's unknown if + // the PICA supports it, and it would mess up the caching, guard against it here. + ASSERT(vertex != -1); + + bool vertex_cache_hit = false; + + if (is_indexed) { + if (g_state.geometry_pipeline.NeedIndexInput()) { + g_state.geometry_pipeline.SubmitIndex(vertex); + continue; + } + + if (g_debug_context && Pica::g_debug_context->recorder) { + int size = index_u16 ? 2 : 1; + memory_accesses.AddAccess(base_address + index_info.offset + size * index, size); + } + + for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { + if (vertex == vertex_cache_ids[i]) { + vs_output = vertex_cache[i]; + vertex_cache_hit = true; + break; + } + } + } + + if (!vertex_cache_hit) { + // Initialize data for the current vertex + Shader::AttributeBuffer input; + loader.LoadVertex(base_address, index, vertex, input, memory_accesses); + + // Send to vertex shader + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, + (void*)&input); + shader_unit.LoadInput(regs.vs, input); + shader_engine->Run(g_state.vs, shader_unit); + shader_unit.WriteOutput(regs.vs, vs_output); + + if (is_indexed) { + vertex_cache[vertex_cache_pos] = vs_output; + vertex_cache_ids[vertex_cache_pos] = vertex; + vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; + } + } + + // Send to geometry pipeline + g_state.geometry_pipeline.SubmitVertex(vs_output); + } + + for (auto& range : memory_accesses.ranges) { + g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first), + range.second, range.first); + } + + VideoCore::g_renderer->Rasterizer()->DrawTriangles(); + if (g_debug_context) { + g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); + } +} + static void WritePicaReg(u32 id, u32 value, u32 mask) { auto& regs = g_state.regs; @@ -168,95 +386,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // Load default vertex input attributes case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[0], 0x233): case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[1], 0x234): - case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235): { - // TODO: Does actual hardware indeed keep an intermediate buffer or does - // it directly write the values? - default_attr_write_buffer[default_attr_counter++] = value; - - // Default attributes are written in a packed format such that four float24 values are - // encoded in - // three 32-bit numbers. We write to internal memory once a full such vector is - // written. - if (default_attr_counter >= 3) { - default_attr_counter = 0; - - auto& setup = regs.pipeline.vs_default_attributes_setup; - - if (setup.index >= 16) { - LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index); - break; - } - - Math::Vec4 attribute; - - // NOTE: The destination component order indeed is "backwards" - attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8); - attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) | - ((default_attr_write_buffer[1] >> 16) & 0xFFFF)); - attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) | - ((default_attr_write_buffer[2] >> 24) & 0xFF)); - attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF); - - LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index, - attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(), - attribute.w.ToFloat32()); - - // TODO: Verify that this actually modifies the register! - if (setup.index < 15) { - g_state.input_default_attributes.attr[setup.index] = attribute; - setup.index++; - } else { - // Put each attribute into an immediate input buffer. When all specified immediate - // attributes are present, the Vertex Shader is invoked and everything is sent to - // the primitive assembler. - - auto& immediate_input = g_state.immediate.input_vertex; - auto& immediate_attribute_id = g_state.immediate.current_attribute; - - immediate_input.attr[immediate_attribute_id] = attribute; - - if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) { - immediate_attribute_id += 1; - } else { - MICROPROFILE_SCOPE(GPU_Drawing); - immediate_attribute_id = 0; - - auto* shader_engine = Shader::GetEngine(); - shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); - - // Send to vertex shader - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, - static_cast(&immediate_input)); - Shader::UnitState shader_unit; - Shader::AttributeBuffer output{}; - - shader_unit.LoadInput(regs.vs, immediate_input); - shader_engine->Run(g_state.vs, shader_unit); - shader_unit.WriteOutput(regs.vs, output); - - // Send to geometry pipeline - if (g_state.immediate.reset_geometry_pipeline) { - g_state.geometry_pipeline.Reconfigure(); - g_state.immediate.reset_geometry_pipeline = false; - } - ASSERT(!g_state.geometry_pipeline.NeedIndexInput()); - g_state.geometry_pipeline.Setup(shader_engine); - g_state.geometry_pipeline.SubmitVertex(output); - - // TODO: If drawing after every immediate mode triangle kills performance, - // change it to flush triangles whenever a drawing config register changes - // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550 - VideoCore::g_renderer->Rasterizer()->DrawTriangles(); - if (g_debug_context) { - g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, - nullptr); - } - } - } - } + case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235): + LoadDefaultVertexAttributes(value); break; - } case PICA_REG_INDEX(pipeline.gpu_mode): // This register likely just enables vertex processing and doesn't need any special handling @@ -275,136 +407,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // It seems like these trigger vertex rendering case PICA_REG_INDEX(pipeline.trigger_draw): - case PICA_REG_INDEX(pipeline.trigger_draw_indexed): { - MICROPROFILE_SCOPE(GPU_Drawing); - -#if PICA_LOG_TEV - DebugUtils::DumpTevStageConfig(regs.GetTevStages()); -#endif - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); - - // Processes information about internal vertex attributes to figure out how a vertex is - // loaded. - // Later, these can be compiled and cached. - const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress(); - VertexLoader loader(regs.pipeline); - - // Load vertices - bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); - - const auto& index_info = regs.pipeline.index_array; - const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset); - const u16* index_address_16 = reinterpret_cast(index_address_8); - bool index_u16 = index_info.format != 0; - - PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler; - - if (g_debug_context && g_debug_context->recorder) { - for (int i = 0; i < 3; ++i) { - const auto texture = regs.texturing.GetTextures()[i]; - if (!texture.enabled) - continue; - - u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress()); - g_debug_context->recorder->MemoryAccessed( - texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) * - texture.config.width / 2 * texture.config.height, - texture.config.GetPhysicalAddress()); - } - } - - DebugUtils::MemoryAccessTracker memory_accesses; - - // Simple circular-replacement vertex cache - // The size has been tuned for optimal balance between hit-rate and the cost of lookup - const size_t VERTEX_CACHE_SIZE = 32; - std::array vertex_cache_ids; - std::array vertex_cache; - Shader::AttributeBuffer vs_output; - - unsigned int vertex_cache_pos = 0; - vertex_cache_ids.fill(-1); - - auto* shader_engine = Shader::GetEngine(); - Shader::UnitState shader_unit; - - shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); - - g_state.geometry_pipeline.Reconfigure(); - g_state.geometry_pipeline.Setup(shader_engine); - if (g_state.geometry_pipeline.NeedIndexInput()) - ASSERT(is_indexed); - - for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) { - // Indexed rendering doesn't use the start offset - unsigned int vertex = - is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) - : (index + regs.pipeline.vertex_offset); - - // -1 is a common special value used for primitive restart. Since it's unknown if - // the PICA supports it, and it would mess up the caching, guard against it here. - ASSERT(vertex != -1); - - bool vertex_cache_hit = false; - - if (is_indexed) { - if (g_state.geometry_pipeline.NeedIndexInput()) { - g_state.geometry_pipeline.SubmitIndex(vertex); - continue; - } - - if (g_debug_context && Pica::g_debug_context->recorder) { - int size = index_u16 ? 2 : 1; - memory_accesses.AddAccess(base_address + index_info.offset + size * index, - size); - } - - for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { - if (vertex == vertex_cache_ids[i]) { - vs_output = vertex_cache[i]; - vertex_cache_hit = true; - break; - } - } - } - - if (!vertex_cache_hit) { - // Initialize data for the current vertex - Shader::AttributeBuffer input; - loader.LoadVertex(base_address, index, vertex, input, memory_accesses); - - // Send to vertex shader - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, - (void*)&input); - shader_unit.LoadInput(regs.vs, input); - shader_engine->Run(g_state.vs, shader_unit); - shader_unit.WriteOutput(regs.vs, vs_output); - - if (is_indexed) { - vertex_cache[vertex_cache_pos] = vs_output; - vertex_cache_ids[vertex_cache_pos] = vertex; - vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; - } - } - - // Send to geometry pipeline - g_state.geometry_pipeline.SubmitVertex(vs_output); - } - - for (auto& range : memory_accesses.ranges) { - g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first), - range.second, range.first); - } - - VideoCore::g_renderer->Rasterizer()->DrawTriangles(); - if (g_debug_context) { - g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); - } - + case PICA_REG_INDEX(pipeline.trigger_draw_indexed): + Draw(id); break; - } case PICA_REG_INDEX(gs.bool_uniforms): WriteUniformBoolReg(g_state.gs, g_state.regs.gs.bool_uniforms.Value()); -- cgit v1.2.3