diff options
40 files changed, 1110 insertions, 392 deletions
diff --git a/.ci/templates/build-single.yml b/.ci/templates/build-single.yml index 77eeb96b5..c411e25d1 100644 --- a/.ci/templates/build-single.yml +++ b/.ci/templates/build-single.yml @@ -14,7 +14,7 @@ steps: cacheHitVar: CACHE_RESTORED - script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh displayName: 'Build' -- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh +- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && RELEASE_NAME=$(BuildName) ./.ci/scripts/$(ScriptFolder)/upload.sh displayName: 'Package Artifacts' - publish: artifacts artifact: 'yuzu-$(BuildName)-$(BuildSuffix)' diff --git a/.ci/templates/build-standard.yml b/.ci/templates/build-standard.yml index 9975f5c49..6cd209dbf 100644 --- a/.ci/templates/build-standard.yml +++ b/.ci/templates/build-standard.yml @@ -3,7 +3,7 @@ jobs: displayName: 'standard' pool: vmImage: ubuntu-latest - strategy: + strategy: maxParallel: 10 matrix: windows: diff --git a/.ci/templates/build-testing.yml b/.ci/templates/build-testing.yml index 101e52996..cb7736205 100644 --- a/.ci/templates/build-testing.yml +++ b/.ci/templates/build-testing.yml @@ -3,19 +3,21 @@ jobs: displayName: 'testing' pool: vmImage: ubuntu-latest - strategy: - maxParallel: 10 + strategy: + maxParallel: 5 matrix: windows: BuildSuffix: 'windows-testing' ScriptFolder: 'windows' steps: + - script: pip install requests urllib3 + displayName: 'Prepare Environment' - task: PythonScript@0 condition: eq(variables['Build.Reason'], 'PullRequest') displayName: 'Determine Testing Status' inputs: scriptSource: 'filePath' - scriptPath: '../scripts/merge/check-label-presence.py' + scriptPath: '.ci/scripts/merge/check-label-presence.py' arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build' - ${{ if eq(variables.enabletesting, 'true') }}: - template: ./sync-source.yml @@ -27,4 +29,4 @@ jobs: matchLabel: 'testing-merge' - template: ./build-single.yml parameters: - artifactSource: 'false'
\ No newline at end of file + artifactSource: 'false' diff --git a/.ci/templates/release.yml b/.ci/templates/release.yml deleted file mode 100644 index 60bebd2aa..000000000 --- a/.ci/templates/release.yml +++ /dev/null @@ -1,29 +0,0 @@ -steps: - - task: DownloadPipelineArtifact@2 - displayName: 'Download Windows Release' - inputs: - artifactName: 'yuzu-$(BuildName)-windows-mingw' - buildType: 'current' - targetPath: '$(Build.ArtifactStagingDirectory)' - - task: DownloadPipelineArtifact@2 - displayName: 'Download Linux Release' - inputs: - artifactName: 'yuzu-$(BuildName)-linux' - buildType: 'current' - targetPath: '$(Build.ArtifactStagingDirectory)' - - task: DownloadPipelineArtifact@2 - displayName: 'Download Release Point' - inputs: - artifactName: 'yuzu-$(BuildName)-release-point' - buildType: 'current' - targetPath: '$(Build.ArtifactStagingDirectory)' - - script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha - displayName: 'Calculate Release Point' - - task: GitHubRelease@0 - inputs: - gitHubConnection: $(GitHubReleaseConnectionName) - repositoryName: '$(GitHubReleaseRepoName)' - action: 'create' - target: $(variables.tagcommit) - title: 'yuzu $(BuildName) #$(Build.BuildId)' - assets: '$(Build.ArtifactStagingDirectory)/*' @@ -2,6 +2,7 @@ yuzu emulator ============= [![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu) [![AppVeyor CI Build Status](https://ci.appveyor.com/api/projects/status/77k97svb2usreu68?svg=true)](https://ci.appveyor.com/project/bunnei/yuzu) +[![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/) yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/). diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 3175579cc..bd036cbe8 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); // On entering GPU code, assume all memory may be touched by the ARM core. - gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + gpu.Maxwell3D().dirty.OnMemoryWrite(); dma_pushbuffer_subindex = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 7404a8163..08586d33c 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + system.GPU().Maxwell3D().dirty.OnMemoryWrite(); } break; } @@ -50,13 +50,14 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { } void KeplerCompute::ProcessLaunch() { - const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description, LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32)); - const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start; - LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc); + const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start; + LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr); + + rasterizer.DispatchCompute(code_addr); } } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 0561f676c..44279de00 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + system.GPU().Maxwell3D().dirty.OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 8755b8af4..74c46ec04 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { + InitDirtySettings(); InitializeRegisterDefaults(); } @@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.stencil_back_func_mask = 0xFFFFFFFF; regs.stencil_back_mask = 0xFFFFFFFF; + regs.depth_test_func = Regs::ComparisonOp::Always; + regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise; + regs.cull.cull_face = Regs::Cull::CullFace::Back; + // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a // register carrying a default value. Assume it's OpenGL's default (1). regs.point_size = 1.0f; @@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rt_separate_frag_data = 1; } +#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name)) + +void Maxwell3D::InitDirtySettings() { + const auto set_block = [this](const u32 start, const u32 range, const u8 position) { + const auto start_itr = dirty_pointers.begin() + start; + const auto end_itr = start_itr + range; + std::fill(start_itr, end_itr, position); + }; + dirty.regs.fill(true); + + // Init Render Targets + constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); + constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt); + constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8; + u32 rt_dirty_reg = DIRTY_REGS_POS(render_target); + for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) { + set_block(rt_reg, registers_per_rt, rt_dirty_reg); + rt_dirty_reg++; + } + constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer); + dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag; + dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag; + dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag; + constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); + constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta); + set_block(zeta_reg, registers_in_zeta, depth_buffer_flag); + + // Init Vertex Arrays + constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array); + constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32); + constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays; + u32 va_reg = DIRTY_REGS_POS(vertex_array); + u32 vi_reg = DIRTY_REGS_POS(vertex_instance); + for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end; + vertex_reg += vertex_array_size) { + set_block(vertex_reg, 3, va_reg); + // The divisor concerns vertex array instances + dirty_pointers[vertex_reg + 3] = vi_reg; + va_reg++; + vi_reg++; + } + constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit); + constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32); + constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays; + va_reg = DIRTY_REGS_POS(vertex_array); + for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end; + vertex_reg += vertex_limit_size) { + set_block(vertex_reg, vertex_limit_size, va_reg); + va_reg++; + } + constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays); + constexpr u32 vertex_instance_size = + sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32); + constexpr u32 vertex_instance_end = + vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays; + vi_reg = DIRTY_REGS_POS(vertex_instance); + for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end; + vertex_reg += vertex_instance_size) { + set_block(vertex_reg, vertex_instance_size, vi_reg); + vi_reg++; + } + set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(), + DIRTY_REGS_POS(vertex_attrib_format)); + + // Init Shaders + constexpr u32 shader_registers_count = + sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); + set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count, + DIRTY_REGS_POS(shaders)); + + // State + + // Viewport + constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport); + constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports); + constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32); + set_block(viewport_start, viewport_size, viewport_dirty_reg); + constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control); + constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32); + set_block(view_volume_start, view_volume_size, viewport_dirty_reg); + + // Viewport transformation + constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform); + constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32); + set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform)); + + // Cullmode + constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull); + constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32); + set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode)); + + // Screen y control + dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control); + + // Primitive Restart + constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart); + constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32); + set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart)); + + // Depth Test + constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test); + dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg; + + // Stencil Test + constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test); + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg; + + // Color Mask + constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask); + dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg; + set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32), + color_mask_dirty_reg); + // Blend State + constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state); + set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32), + blend_state_dirty_reg); + dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg; + set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg); + set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32), + blend_state_dirty_reg); + + // Scissor State + constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test); + set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32), + scissor_test_dirty_reg); + + // Polygon Offset + constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset); + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg; +} + void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) { // Reset the current macro. executing_macro = 0; @@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { const u32 method = method_call.method; + if (method == cb_data_state.current) { + regs.reg_array[method] = method_call.argument; + ProcessCBData(method_call.argument); + return; + } else if (cb_data_state.current != null_cb_data) { + FinishCBData(); + } + // It is an error to write to a register other than the current macro's ARG register before it // has finished execution. if (executing_macro != 0) { @@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { if (regs.reg_array[method] != method_call.argument) { regs.reg_array[method] = method_call.argument; - // Color buffers - constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt); - constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); - if (method >= first_rt_reg && - method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) { - const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt; - dirty_flags.color_buffer.set(rt_index); - } - - // Zeta buffer - constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); - if (method == MAXWELL3D_REG_INDEX(zeta_enable) || - method == MAXWELL3D_REG_INDEX(zeta_width) || - method == MAXWELL3D_REG_INDEX(zeta_height) || - (method >= MAXWELL3D_REG_INDEX(zeta) && - method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) { - dirty_flags.zeta_buffer = true; - } - - // Shader - constexpr u32 shader_registers_count = - sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); - if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) && - method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) { - dirty_flags.shaders = true; - } - - // Vertex format - if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) && - method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) { - dirty_flags.vertex_attrib_format = true; - } - - // Vertex buffer - if (method >= MAXWELL3D_REG_INDEX(vertex_array) && - method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) { - dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); - } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && - method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) { - dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); - } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) && - method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) { - dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays)); + const std::size_t dirty_reg = dirty_pointers[method]; + if (dirty_reg) { + dirty.regs[dirty_reg] = true; + if (dirty_reg >= DIRTY_REGS_POS(vertex_array) && + dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) { + dirty.vertex_array_buffers = true; + } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) && + dirty_reg < DIRTY_REGS_POS(vertex_instances)) { + dirty.vertex_instances = true; + } else if (dirty_reg >= DIRTY_REGS_POS(render_target) && + dirty_reg < DIRTY_REGS_POS(render_settings)) { + dirty.render_settings = true; + } } } @@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - ProcessCBData(method_call.argument); + StartCBData(method); break; } case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): { @@ -249,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryGet(); break; } + case MAXWELL3D_REG_INDEX(condition.mode): { + ProcessQueryCondition(); + break; + } case MAXWELL3D_REG_INDEX(sync_info): { ProcessSyncPoint(); break; @@ -261,7 +401,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - dirty_flags.OnMemoryWrite(); + dirty.OnMemoryWrite(); } break; } @@ -302,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() { result = regs.query.query_sequence; break; default: + result = 1; UNIMPLEMENTED_MSG("Unimplemented query select type {}", static_cast<u32>(regs.query.query_get.select.Value())); } @@ -333,7 +474,6 @@ void Maxwell3D::ProcessQueryGet() { query_result.timestamp = system.CoreTiming().GetTicks(); memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); } - dirty_flags.OnMemoryWrite(); break; } default: @@ -342,6 +482,45 @@ void Maxwell3D::ProcessQueryGet() { } } +void Maxwell3D::ProcessQueryCondition() { + const GPUVAddr condition_address{regs.condition.Address()}; + switch (regs.condition.mode) { + case Regs::ConditionMode::Always: { + execute_on = true; + break; + } + case Regs::ConditionMode::Never: { + execute_on = false; + break; + } + case Regs::ConditionMode::ResNonZero: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U; + break; + } + case Regs::ConditionMode::Equal: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = + cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode; + break; + } + case Regs::ConditionMode::NotEqual: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = + cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode; + break; + } + default: { + UNIMPLEMENTED_MSG("Uninplemented Condition Mode!"); + execute_on = true; + break; + } + } +} + void Maxwell3D::ProcessSyncPoint() { const u32 sync_point = regs.sync_info.sync_point.Value(); const u32 increment = regs.sync_info.increment.Value(); @@ -405,23 +584,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) { } void Maxwell3D::ProcessCBData(u32 value) { + const u32 id = cb_data_state.id; + cb_data_state.buffer[id][cb_data_state.counter] = value; + // Increment the current buffer position. + regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; + cb_data_state.counter++; +} + +void Maxwell3D::StartCBData(u32 method) { + constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); + cb_data_state.start_pos = regs.const_buffer.cb_pos; + cb_data_state.id = method - first_cb_data; + cb_data_state.current = method; + cb_data_state.counter = 0; + ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); +} + +void Maxwell3D::FinishCBData() { // Write the input value to the current const buffer at the current position. const GPUVAddr buffer_address = regs.const_buffer.BufferAddress(); ASSERT(buffer_address != 0); // Don't allow writing past the end of the buffer. - ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size); - - const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos}; + ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size); - u8* ptr{memory_manager.GetPointer(address)}; - rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32)); - memory_manager.Write<u32>(address, value); + const GPUVAddr address{buffer_address + cb_data_state.start_pos}; + const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos; - dirty_flags.OnMemoryWrite(); + const u32 id = cb_data_state.id; + memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); + dirty.OnMemoryWrite(); - // Increment the current buffer position. - regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; + cb_data_state.id = null_cb_data; + cb_data_state.current = null_cb_data; } Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 8d15c8a48..1ee982b76 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -90,6 +90,20 @@ public: enum class QuerySelect : u32 { Zero = 0, + TimeElapsed = 2, + TransformFeedbackPrimitivesGenerated = 11, + PrimitivesGenerated = 18, + SamplesPassed = 21, + TransformFeedbackUnknown = 26, + }; + + struct QueryCompare { + u32 initial_sequence; + u32 initial_mode; + u32 unknown1; + u32 unknown2; + u32 current_sequence; + u32 current_mode; }; enum class QuerySyncCondition : u32 { @@ -97,6 +111,14 @@ public: GreaterThan = 1, }; + enum class ConditionMode : u32 { + Never = 0, + Always = 1, + ResNonZero = 2, + Equal = 3, + NotEqual = 4, + }; + enum class ShaderProgram : u32 { VertexA = 0, VertexB = 1, @@ -815,7 +837,18 @@ public: BitField<4, 1, u32> alpha_to_one; } multisample_control; - INSERT_PADDING_WORDS(0x7); + INSERT_PADDING_WORDS(0x4); + + struct { + u32 address_high; + u32 address_low; + ConditionMode mode; + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } condition; struct { u32 tsc_address_high; @@ -1124,23 +1157,77 @@ public: State state{}; - struct DirtyFlags { - std::bitset<8> color_buffer{0xFF}; - std::bitset<32> vertex_array{0xFFFFFFFF}; + struct DirtyRegs { + static constexpr std::size_t NUM_REGS = 256; + union { + struct { + bool null_dirty; + + // Vertex Attributes + bool vertex_attrib_format; + + // Vertex Arrays + std::array<bool, 32> vertex_array; + + bool vertex_array_buffers; + + // Vertex Instances + std::array<bool, 32> vertex_instance; - bool vertex_attrib_format = true; - bool zeta_buffer = true; - bool shaders = true; + bool vertex_instances; + + // Render Targets + std::array<bool, 8> render_target; + bool depth_buffer; + + bool render_settings; + + // Shaders + bool shaders; + + // Rasterizer State + bool viewport; + bool clip_coefficient; + bool cull_mode; + bool primitive_restart; + bool depth_test; + bool stencil_test; + bool blend_state; + bool scissor_test; + bool transform_feedback; + bool color_mask; + bool polygon_offset; + + // Complementary + bool viewport_transform; + bool screen_y_control; + + bool memory_general; + }; + std::array<bool, NUM_REGS> regs; + }; + + void ResetVertexArrays() { + vertex_array.fill(true); + vertex_array_buffers = true; + } + + void ResetRenderTargets() { + depth_buffer = true; + render_target.fill(true); + render_settings = true; + } void OnMemoryWrite() { - zeta_buffer = true; shaders = true; - color_buffer.set(); - vertex_array.set(); + memory_general = true; + ResetRenderTargets(); + ResetVertexArrays(); } - }; - DirtyFlags dirty_flags; + } dirty{}; + + std::array<u8, Regs::NUM_REGS> dirty_pointers{}; /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1169,6 +1256,10 @@ public: return macro_memory; } + bool ShouldExecute() const { + return execute_on; + } + private: void InitializeRegisterDefaults(); @@ -1192,14 +1283,27 @@ private: /// Interpreter for the macro codes uploaded to the GPU. MacroInterpreter macro_interpreter; + static constexpr u32 null_cb_data = 0xFFFFFFFF; + struct { + std::array<std::array<u32, 0x4000>, 16> buffer; + u32 current{null_cb_data}; + u32 id{null_cb_data}; + u32 start_pos{}; + u32 counter{}; + } cb_data_state; + Upload::State upload_state; + bool execute_on{true}; + /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; /// Retrieves information about a specific TSC entry from the TSC buffer. Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; + void InitDirtySettings(); + /** * Call a macro on this engine. * @param method Method to call @@ -1219,11 +1323,16 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); + // Handles Conditional Rendering + void ProcessQueryCondition(); + /// Handles writes to syncing register. void ProcessSyncPoint(); /// Handles a write to the CB_DATA[i] register. + void StartCBData(u32 method); void ProcessCBData(u32 value); + void FinishCBData(); /// Handles a write to the CB_BIND register. void ProcessCBBind(Regs::ShaderStage stage); @@ -1290,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544); ASSERT_REG_POSITION(point_size, 0x546); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); +ASSERT_REG_POSITION(condition, 0x554); ASSERT_REG_POSITION(tsc, 0x557); ASSERT_REG_POSITION(polygon_offset_factor, 0x55b); ASSERT_REG_POSITION(tic, 0x55D); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index afb9578d0..a28c04473 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { } void MaxwellDMA::HandleCopy() { - LOG_WARNING(HW_GPU, "Requested a DMA copy"); + LOG_TRACE(HW_GPU, "Requested a DMA copy"); const GPUVAddr source = regs.src_address.Address(); const GPUVAddr dest = regs.dst_address.Address(); @@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() { } // All copies here update the main memory, so mark all rasterizer states as invalid. - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + system.GPU().Maxwell3D().dirty.OnMemoryWrite(); if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 79d469b88..8520a0143 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -931,8 +931,6 @@ union Instruction { } csetp; union { - BitField<35, 4, PredCondition> cond; - BitField<49, 1, u64> h_and; BitField<6, 1, u64> ftz; BitField<45, 2, PredOperation> op; BitField<3, 3, u64> pred3; @@ -940,9 +938,21 @@ union Instruction { BitField<43, 1, u64> negate_a; BitField<44, 1, u64> abs_a; BitField<47, 2, HalfType> type_a; - BitField<31, 1, u64> negate_b; - BitField<30, 1, u64> abs_b; - BitField<28, 2, HalfType> type_b; + union { + BitField<35, 4, PredCondition> cond; + BitField<49, 1, u64> h_and; + BitField<31, 1, u64> negate_b; + BitField<30, 1, u64> abs_b; + BitField<28, 2, HalfType> type_b; + } reg; + union { + BitField<56, 1, u64> negate_b; + BitField<54, 1, u64> abs_b; + } cbuf; + union { + BitField<49, 4, PredCondition> cond; + BitField<53, 1, u64> h_and; + } cbuf_and_imm; BitField<42, 1, u64> neg_pred; BitField<39, 3, u64> pred39; } hsetp2; @@ -1548,7 +1558,9 @@ public: HFMA2_RC, HFMA2_RR, HFMA2_IMM_R, + HSETP2_C, HSETP2_R, + HSETP2_IMM, HSET2_R, POPC_C, POPC_R, @@ -1831,7 +1843,9 @@ private: INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"), INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"), INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"), - INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"), + INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"), + INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), + INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 1b4975498..21007d8b2 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -50,6 +50,14 @@ const Engines::Maxwell3D& GPU::Maxwell3D() const { return *maxwell_3d; } +Engines::KeplerCompute& GPU::KeplerCompute() { + return *kepler_compute; +} + +const Engines::KeplerCompute& GPU::KeplerCompute() const { + return *kepler_compute; +} + MemoryManager& GPU::MemoryManager() { return *memory_manager; } @@ -143,12 +151,12 @@ enum class BufferMethods { NotifyIntr = 0x8, WrcacheFlush = 0x9, Unk28 = 0xA, - Unk2c = 0xB, + UnkCacheFlush = 0xB, RefCnt = 0x14, SemaphoreAcquire = 0x1A, SemaphoreRelease = 0x1B, - Unk70 = 0x1C, - Unk74 = 0x1D, + FenceValue = 0x1C, + FenceAction = 0x1D, Unk78 = 0x1E, Unk7c = 0x1F, Yield = 0x20, @@ -194,6 +202,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::SemaphoreAddressLow: case BufferMethods::SemaphoreSequence: case BufferMethods::RefCnt: + case BufferMethods::UnkCacheFlush: + case BufferMethods::WrcacheFlush: + case BufferMethods::FenceValue: + case BufferMethods::FenceAction: break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -204,21 +216,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); break; } - case BufferMethods::WrcacheFlush: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented"); - break; - } case BufferMethods::Unk28: { // TODO(Kmather73): Research and implement this method. LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); break; } - case BufferMethods::Unk2c: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented"); - break; - } case BufferMethods::SemaphoreAcquire: { ProcessSemaphoreAcquire(); break; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index fe6628923..0055e5326 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -155,6 +155,12 @@ public: /// Returns a const reference to the Maxwell3D GPU engine. const Engines::Maxwell3D& Maxwell3D() const; + /// Returns a reference to the KeplerCompute GPU engine. + Engines::KeplerCompute& KeplerCompute(); + + /// Returns a reference to the KeplerCompute GPU engine. + const Engines::KeplerCompute& KeplerCompute() const; + /// Returns a reference to the GPU memory manager. Tegra::MemoryManager& MemoryManager(); @@ -194,7 +200,12 @@ public: u32 semaphore_acquire; u32 semaphore_release; - INSERT_PADDING_WORDS(0xE4); + u32 fence_value; + union { + BitField<4, 4, u32> operation; + BitField<8, 8, u32> id; + } fence_action; + INSERT_PADDING_WORDS(0xE2); // Puller state u32 acquire_mode; @@ -274,6 +285,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7); ASSERT_REG_POSITION(reference_count, 0x14); ASSERT_REG_POSITION(semaphore_acquire, 0x1A); ASSERT_REG_POSITION(semaphore_release, 0x1B); +ASSERT_REG_POSITION(fence_value, 0x1C); +ASSERT_REG_POSITION(fence_action, 0x1D); ASSERT_REG_POSITION(acquire_mode, 0x100); ASSERT_REG_POSITION(acquire_source, 0x101); diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 2b7367568..9881df0d5 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -34,6 +34,9 @@ public: /// Clear the current framebuffer virtual void Clear() = 0; + /// Dispatches a compute shader invocation + virtual void DispatchCompute(GPUVAddr code_addr) = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 0bb5c068c..c28ae795c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -4,6 +4,7 @@ #include <algorithm> #include <array> +#include <bitset> #include <memory> #include <string> #include <string_view> @@ -19,6 +20,7 @@ #include "core/core.h" #include "core/hle/kernel/process.h" #include "core/settings.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_rasterizer.h" @@ -105,6 +107,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); + clear_framebuffer.Create(); LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); @@ -124,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; - if (!gpu.dirty_flags.vertex_attrib_format) { + if (!gpu.dirty.vertex_attrib_format) { return state.draw.vertex_array; } - gpu.dirty_flags.vertex_attrib_format = false; + gpu.dirty.vertex_attrib_format = false; MICROPROFILE_SCOPE(OpenGL_VAO); @@ -181,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { } // Rebinding the VAO invalidates the vertex buffer bindings. - gpu.dirty_flags.vertex_array.set(); + gpu.dirty.ResetVertexArrays(); state.draw.vertex_array = vao_entry.handle; return vao_entry.handle; @@ -189,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - if (gpu.dirty_flags.vertex_array.none()) + if (!gpu.dirty.vertex_array_buffers) return; + gpu.dirty.vertex_array_buffers = false; + + const auto& regs = gpu.regs; MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty_flags.vertex_array[index]) + if (!gpu.dirty.vertex_array[index]) continue; + gpu.dirty.vertex_array[index] = false; + gpu.dirty.vertex_instance[index] = false; const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) @@ -224,8 +230,32 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { glVertexArrayBindingDivisor(vao, index, 0); } } +} + +void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { + auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.dirty.vertex_instances) + return; + gpu.dirty.vertex_instances = false; + + const auto& regs = gpu.regs; + // Upload all guest vertex arrays sequentially to our buffer + for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (!gpu.dirty.vertex_instance[index]) + continue; + + gpu.dirty.vertex_instance[index] = false; - gpu.dirty_flags.vertex_array.reset(); + if (regs.instanced_arrays.IsInstancingEnabled(index) && + regs.vertex_array[index].divisor != 0) { + // Enable vertex buffer instancing with the specified divisor. + glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); + } else { + // Disable the vertex buffer instancing. + glVertexArrayBindingDivisor(vao, index, 0); + } + } } GLintptr RasterizerOpenGL::SetupIndexBuffer() { @@ -298,9 +328,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { Shader shader{shader_cache.GetStageProgram(program)}; - const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)}; + const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage); SetupDrawConstBuffers(stage_enum, shader); - SetupGlobalRegions(stage_enum, shader); + SetupDrawGlobalMemory(stage_enum, shader); const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)}; const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage}; @@ -341,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { SyncClipEnabled(clip_distances); - gpu.dirty_flags.shaders = false; + gpu.dirty.shaders = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -424,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents, single_color_target}; - if (fb_config_state == current_framebuffer_config_state && - gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) { + if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) { // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or // single color targets). This is done because the guest registers may not change but the // host framebuffer may contain different attachments return current_depth_stencil_usage; } + gpu.dirty.render_settings = false; current_framebuffer_config_state = fb_config_state; texture_cache.GuardRenderTargets(true); @@ -519,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable}; } +void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb) { + auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + texture_cache.GuardRenderTargets(true); + View color_surface{}; + if (using_color_fb) { + color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false); + } + View depth_surface{}; + if (using_depth_fb || using_stencil_fb) { + depth_surface = texture_cache.GetDepthBufferSurface(false); + } + texture_cache.GuardRenderTargets(false); + + current_state.draw.draw_framebuffer = clear_framebuffer.handle; + current_state.ApplyFramebufferState(); + + if (color_surface) { + color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + } + + if (depth_surface) { + const auto& params = depth_surface->GetSurfaceParams(); + switch (params.type) { + case VideoCore::Surface::SurfaceType::Depth: { + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + break; + } + case VideoCore::Surface::SurfaceType::DepthStencil: { + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + break; + } + default: { UNIMPLEMENTED(); } + } + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + } +} + void RasterizerOpenGL::Clear() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& maxwell3d = system.GPU().Maxwell3D(); + + if (!maxwell3d.ShouldExecute()) { + return; + } + + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; - OpenGLState clear_state; + OpenGLState prev_state{OpenGLState::GetCurState()}; + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); + + OpenGLState clear_state{OpenGLState::GetCurState()}; + clear_state.SetDefaultViewports(); if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; @@ -545,6 +633,7 @@ void RasterizerOpenGL::Clear() { // true. clear_state.depth.test_enabled = true; clear_state.depth.test_func = GL_ALWAYS; + clear_state.depth.write_mask = GL_TRUE; } if (regs.clear_buffers.S) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); @@ -581,8 +670,9 @@ void RasterizerOpenGL::Clear() { return; } - const auto [clear_depth, clear_stencil] = ConfigureFramebuffers( - clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value()); + ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + + SyncViewport(clear_state); if (regs.clear_flags.scissor) { SyncScissorTest(clear_state); } @@ -591,21 +681,18 @@ void RasterizerOpenGL::Clear() { clear_state.EmulateViewportWithScissor(); } - clear_state.ApplyColorMask(); - clear_state.ApplyDepth(); - clear_state.ApplyStencilTest(); - clear_state.ApplyViewport(); - clear_state.ApplyFramebufferState(); + clear_state.AllDirty(); + clear_state.Apply(); if (use_color) { - glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); + glClearBufferfv(GL_COLOR, 0, regs.clear_color); } - if (clear_depth && clear_stencil) { + if (use_depth && use_stencil) { glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); - } else if (clear_depth) { + } else if (use_depth) { glClearBufferfv(GL_DEPTH, 0, ®s.clear_depth); - } else if (clear_stencil) { + } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } } @@ -616,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.ShouldExecute()) { + return; + } + const auto& regs = gpu.regs; SyncColorMask(); @@ -661,6 +753,7 @@ void RasterizerOpenGL::DrawArrays() { // Upload vertex and index data. SetupVertexBuffer(vao); + SetupVertexInstances(vao); const GLintptr index_buffer_offset = SetupIndexBuffer(); // Setup draw parameters. It will automatically choose what glDraw* method to use. @@ -687,7 +780,7 @@ void RasterizerOpenGL::DrawArrays() { if (invalidate) { // As all cached buffers are invalidated, we need to recheck their state. - gpu.dirty_flags.vertex_array.set(); + gpu.dirty.ResetVertexArrays(); } shader_program_manager->ApplyTo(state); @@ -700,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() { params.DispatchDraw(); accelerate_draw = AccelDraw::Disabled; + gpu.dirty.memory_general = false; +} + +void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { + if (!GLAD_GL_ARB_compute_variable_group_size) { + LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the " + "lack of GL_ARB_compute_variable_group_size"); + return; + } + + auto kernel = shader_cache.GetComputeKernel(code_addr); + const auto [program, next_bindings] = kernel->GetProgramHandle({}); + state.draw.shader_program = program; + state.draw.program_pipeline = 0; + + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); + + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); + + SetupComputeConstBuffers(kernel); + SetupComputeGlobalMemory(kernel); + + // TODO(Rodrigo): Bind images and samplers + + buffer_cache.Unmap(); + + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + state.ApplyShaderProgram(); + state.ApplyProgramPipeline(); + + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y, + launch_desc.grid_dim_z, launch_desc.block_dim_x, + launch_desc.block_dim_y, launch_desc.block_dim_z); } void RasterizerOpenGL::FlushAll() {} @@ -775,12 +908,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto stage_index = static_cast<std::size_t>(stage); - const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index]; - - // Upload only the enabled buffers from the 16 constbuffers of each shader stage + const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& shader_stage = stages[static_cast<std::size_t>(stage)]; for (const auto& entry : shader->GetShaderEntries().const_buffers) { - SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry); + const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); + } +} + +void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { + MICROPROFILE_SCOPE(OpenGL_UBO); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + for (const auto& entry : kernel->GetShaderEntries().const_buffers) { + const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; + const std::bitset<8> mask = launch_desc.memory_config.const_buffer_enable_mask.Value(); + Tegra::Engines::ConstBufferInfo buffer; + buffer.address = config.Address(); + buffer.size = config.size; + buffer.enabled = mask[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); } } @@ -801,24 +947,39 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b bind_ubo_pushbuffer.Push(cbuf, offset, size); } -void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader) { +void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; - const auto alignment{device.GetShaderStorageBufferAlignment()}; - for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; - const auto actual_addr{memory_manager.Read<u64>(addr)}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); + } +} - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten()); - bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); +void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); } } +void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, + GPUVAddr gpu_addr, std::size_t size) { + const auto alignment{device.GetShaderStorageBufferAlignment()}; + const auto [ssbo, buffer_offset] = + buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten()); + bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); +} + TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader, BaseBindings base_bindings) { MICROPROFILE_SCOPE(OpenGL_Texture); @@ -907,10 +1068,11 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); - state.cull.enabled = regs.cull.enabled != 0; + const auto& regs = maxwell3d.regs; + state.cull.enabled = regs.cull.enabled != 0; if (state.cull.enabled) { state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face); state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); @@ -943,16 +1105,21 @@ void RasterizerOpenGL::SyncDepthTestState() { state.depth.test_enabled = regs.depth_test_enable != 0; state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE; - if (!state.depth.test_enabled) + if (!state.depth.test_enabled) { return; + } state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func); } void RasterizerOpenGL::SyncStencilTestState() { - const auto& regs = system.GPU().Maxwell3D().regs; - state.stencil.test_enabled = regs.stencil_enable != 0; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.stencil_test) { + return; + } + const auto& regs = maxwell3d.regs; + state.stencil.test_enabled = regs.stencil_enable != 0; if (!regs.stencil_enable) { return; } @@ -981,10 +1148,17 @@ void RasterizerOpenGL::SyncStencilTestState() { state.stencil.back.action_depth_fail = GL_KEEP; state.stencil.back.action_depth_pass = GL_KEEP; } + state.MarkDirtyStencilState(); + maxwell3d.dirty.stencil_test = false; } void RasterizerOpenGL::SyncColorMask() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.color_mask) { + return; + } + const auto& regs = maxwell3d.regs; + const std::size_t count = regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; for (std::size_t i = 0; i < count; i++) { @@ -995,6 +1169,9 @@ void RasterizerOpenGL::SyncColorMask() { dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE; } + + state.MarkDirtyColorMask(); + maxwell3d.dirty.color_mask = false; } void RasterizerOpenGL::SyncMultiSampleState() { @@ -1009,7 +1186,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() { } void RasterizerOpenGL::SyncBlendState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.blend_state) { + return; + } + const auto& regs = maxwell3d.regs; state.blend_color.red = regs.blend_color.r; state.blend_color.green = regs.blend_color.g; @@ -1032,6 +1213,8 @@ void RasterizerOpenGL::SyncBlendState() { for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { state.blend[i].enabled = false; } + maxwell3d.dirty.blend_state = false; + state.MarkDirtyBlendState(); return; } @@ -1048,6 +1231,9 @@ void RasterizerOpenGL::SyncBlendState() { blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); } + + state.MarkDirtyBlendState(); + maxwell3d.dirty.blend_state = false; } void RasterizerOpenGL::SyncLogicOpState() { @@ -1099,13 +1285,21 @@ void RasterizerOpenGL::SyncPointState() { } void RasterizerOpenGL::SyncPolygonOffset() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.polygon_offset) { + return; + } + const auto& regs = maxwell3d.regs; + state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; state.polygon_offset.units = regs.polygon_offset_units; state.polygon_offset.factor = regs.polygon_offset_factor; state.polygon_offset.clamp = regs.polygon_offset_clamp; + + state.MarkDirtyPolygonOffset(); + maxwell3d.dirty.polygon_offset = false; } void RasterizerOpenGL::SyncAlphaTest() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 40b571d58..8b123c48d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -58,6 +58,7 @@ public: void DrawArrays() override; void Clear() override; + void DispatchCompute(GPUVAddr code_addr) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -108,17 +109,30 @@ private: OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true, bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); + void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb); + /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader); + /// Configures the current constbuffers to use for the kernel invocation. + void SetupComputeConstBuffers(const Shader& kernel); + /// Configures a constant buffer. void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer, const GLShader::ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. - void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader); + void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader); + + /// Configures the current global memory entries to use for the kernel invocation. + void SetupComputeGlobalMemory(const Shader& kernel); + + /// Configures a constant buffer. + void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, + std::size_t size); /// Configures the current textures to use for the draw command. Returns shaders texture buffer /// usage. @@ -216,6 +230,7 @@ private: GLuint SetupVertexFormat(); void SetupVertexBuffer(GLuint vao); + void SetupVertexInstances(GLuint vao); GLintptr SetupIndexBuffer(); @@ -226,6 +241,8 @@ private: enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; + OGLFramebuffer clear_framebuffer; + using CachedPageMap = boost::icl::interval_map<u64, int>; CachedPageMap cached_pages; }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 32dd9eae7..1c90facc3 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -23,13 +23,13 @@ namespace OpenGL { using VideoCommon::Shader::ProgramCode; -// One UBO is always reserved for emulation values -constexpr u32 RESERVED_UBOS = 1; +// One UBO is always reserved for emulation values on staged shaders +constexpr u32 STAGE_RESERVED_UBOS = 1; struct UnspecializedShader { std::string code; GLShader::ShaderEntries entries; - Maxwell::ShaderProgram program_type; + ProgramType program_type; }; namespace { @@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g } /// Gets the shader type from a Maxwell program type -constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) { +constexpr GLenum GetShaderType(ProgramType program_type) { switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GL_VERTEX_SHADER; - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GL_GEOMETRY_SHADER; - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GL_FRAGMENT_SHADER; + case ProgramType::Compute: + return GL_COMPUTE_SHADER; default: return GL_NONE; } @@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen } } +ProgramType GetProgramType(Maxwell::ShaderProgram program) { + switch (program) { + case Maxwell::ShaderProgram::VertexA: + return ProgramType::VertexA; + case Maxwell::ShaderProgram::VertexB: + return ProgramType::VertexB; + case Maxwell::ShaderProgram::TesselationControl: + return ProgramType::TessellationControl; + case Maxwell::ShaderProgram::TesselationEval: + return ProgramType::TessellationEval; + case Maxwell::ShaderProgram::Geometry: + return ProgramType::Geometry; + case Maxwell::ShaderProgram::Fragment: + return ProgramType::Fragment; + } + UNREACHABLE(); + return {}; +} + /// Calculates the size of a program stream std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { constexpr std::size_t start_offset = 10; @@ -128,13 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { } /// Hashes one (or two) program streams -u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code, +u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code, const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) { if (size_a == 0) { size_a = CalculateProgramSize(code); } u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a); - if (program_type != Maxwell::ShaderProgram::VertexA) { + if (program_type != ProgramType::VertexA) { return unique_identifier; } // VertexA programs include two programs @@ -152,12 +173,12 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, +GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type, ProgramCode program_code, ProgramCode program_code_b) { GLShader::ShaderSetup setup(program_code); setup.program.size_a = CalculateProgramSize(program_code); setup.program.size_b = 0; - if (program_type == Maxwell::ShaderProgram::VertexA) { + if (program_type == ProgramType::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. // Conventional HW does not support this, so we combine VertexA and VertexB into one // stage here. @@ -168,22 +189,23 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b); switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GLShader::GenerateVertexShader(device, setup); - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GLShader::GenerateGeometryShader(device, setup); - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GLShader::GenerateFragmentShader(device, setup); + case ProgramType::Compute: + return GLShader::GenerateComputeShader(device, setup); default: - LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type)); return {}; } } CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries, - Maxwell::ShaderProgram program_type, const ProgramVariant& variant, + ProgramType program_type, const ProgramVariant& variant, bool hint_retrievable = false) { auto base_bindings{variant.base_bindings}; const auto primitive_mode{variant.primitive_mode}; @@ -194,7 +216,14 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn if (entries.shader_viewport_layer_array) { source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; } - source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + if (program_type == ProgramType::Compute) { + source += "#extension GL_ARB_compute_variable_group_size : require\n"; + } + source += '\n'; + + if (program_type != ProgramType::Compute) { + source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + } for (const auto& cbuf : entries.const_buffers) { source += @@ -221,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i); } - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { const auto [glsl_topology, debug_name, max_vertices] = GetPrimitiveDescription(primitive_mode); source += "layout (" + std::string(glsl_topology) + ") in;\n"; source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; } + if (program_type == ProgramType::Compute) { + source += "layout (local_size_variable) in;\n"; + } source += code; @@ -255,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type, +CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type, GLShader::ProgramResult result) : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier}, program_type{program_type}, @@ -268,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, ProgramCode&& program_code_b) { const auto code_size{CalculateProgramSize(program_code)}; const auto code_size_b{CalculateProgramSize(program_code_b)}; - auto result{CreateProgram(params.device, program_type, program_code, program_code_b)}; + auto result{ + CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)}; if (result.first.empty()) { // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now return {}; } params.disk_cache.SaveRaw(ShaderDiskCacheRaw( - params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)), - static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code), - std::move(program_code_b))); + params.unique_identifier, GetProgramType(program_type), + static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)), + std::move(program_code), std::move(program_code_b))); - return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result))); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); } Shader CachedShader::CreateStageFromCache(const ShaderParameters& params, Maxwell::ShaderProgram program_type, GLShader::ProgramResult result) { - return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result))); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); +} + +Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) { + auto result{CreateProgram(params.device, ProgramType::Compute, code, {})}; + + const auto code_size{CalculateProgramSize(code)}; + params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute, + static_cast<u32>(code_size / sizeof(u64)), 0, + std::move(code), {})); + + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); +} + +Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result) { + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); } std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) { GLuint handle{}; - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { handle = GetGeometryShader(variant); } else { const auto [entry, is_cache_miss] = programs.try_emplace(variant); @@ -308,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar handle = program->handle; } - auto base_bindings{variant.base_bindings}; - base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS; + auto base_bindings = variant.base_bindings; + base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()); + if (program_type != ProgramType::Compute) { + base_bindings.cbuf += STAGE_RESERVED_UBOS; + } base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); base_bindings.sampler += static_cast<u32>(entries.samplers.size()); @@ -572,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty_flags.shaders) { + if (!system.GPU().Maxwell3D().dirty.shaders) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -589,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // No shader found - create a new one ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; - if (program == Maxwell::ShaderProgram::VertexA) { + const bool is_program_a{program == Maxwell::ShaderProgram::VertexA}; + if (is_program_a) { const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; program_code_b = GetShaderCode(memory_manager, program_addr_b, memory_manager.GetPointer(program_addr_b)); } - const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); + const auto unique_identifier = + GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b); const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, host_ptr, unique_identifier}; @@ -612,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { return last_shaders[static_cast<std::size_t>(program)] = shader; } +Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + auto& memory_manager{system.GPU().MemoryManager()}; + const auto host_ptr{memory_manager.GetPointer(code_addr)}; + auto kernel = TryGet(host_ptr); + if (kernel) { + return kernel; + } + + // No kernel found - create a new one + auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; + const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})}; + const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)}; + const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, + host_ptr, unique_identifier}; + + const auto found = precompiled_shaders.find(unique_identifier); + if (found == precompiled_shaders.end()) { + kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + } else { + kernel = CachedShader::CreateKernelFromCache(params, found->second); + } + + Register(kernel); + return kernel; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index bbb53cdf4..a3106a0ff 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -61,6 +61,11 @@ public: Maxwell::ShaderProgram program_type, GLShader::ProgramResult result); + static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code); + + static Shader CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result); + VAddr GetCpuAddr() const override { return cpu_addr; } @@ -78,7 +83,7 @@ public: std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant); private: - explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type, + explicit CachedShader(const ShaderParameters& params, ProgramType program_type, GLShader::ProgramResult result); // Geometry programs. These are needed because GLSL needs an input topology but it's not @@ -104,7 +109,7 @@ private: u8* host_ptr{}; VAddr cpu_addr{}; u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; ShaderDiskCacheOpenGL& disk_cache; const PrecompiledPrograms& precompiled_programs; @@ -132,6 +137,9 @@ public: /// Gets the current specified shader stage program Shader GetStageProgram(Maxwell::ShaderProgram program); + /// Gets a compute kernel in the passed address + Shader GetComputeKernel(GPUVAddr code_addr); + protected: // We do not have to flush this cache as things in it are never modified by us. void FlushObjectInner(const Shader& object) override {} diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 119073776..ffe26b241 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,7 +37,6 @@ using namespace std::string_literals; using namespace VideoCommon::Shader; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage; using Operation = const OperationNode&; enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; @@ -162,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +constexpr bool IsVertexShader(ProgramType stage) { + return stage == ProgramType::VertexA || stage == ProgramType::VertexB; +} + class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage, std::string suffix) : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} @@ -248,25 +251,21 @@ public: } entries.clip_distances = ir.GetClipDistances(); entries.shader_viewport_layer_array = - stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex()); + IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex()); entries.shader_length = ir.GetLength(); return entries; } private: - using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation); - using OperationDecompilersArray = - std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; - void DeclareVertex() { - if (stage != ShaderStage::Vertex) + if (!IsVertexShader(stage)) return; DeclareVertexRedeclarations(); } void DeclareGeometry() { - if (stage != ShaderStage::Geometry) { + if (stage != ProgramType::Geometry) { return; } @@ -297,14 +296,14 @@ private: break; } } - if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) { + if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } if (ir.UsesViewportIndex()) { code.AddLine("int gl_ViewportIndex;"); } - } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex && + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && !device.HasVertexViewportLayer()) { LOG_ERROR( Render_OpenGL, @@ -341,11 +340,16 @@ private: } void DeclareLocalMemory() { - if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) { - const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; - code.AddLine("float {}[{}];", GetLocalMemory(), element_count); - code.AddNewLine(); + // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at + // specialization time. + const u64 local_memory_size = + stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize(); + if (local_memory_size == 0) { + return; } + const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; + code.AddLine("float {}[{}];", GetLocalMemory(), element_count); + code.AddNewLine(); } void DeclareInternalFlags() { @@ -399,12 +403,12 @@ private: const u32 location{GetGenericAttributeIndex(index)}; std::string name{GetInputAttribute(index)}; - if (stage == ShaderStage::Geometry) { + if (stage == ProgramType::Geometry) { name = "gs_" + name + "[]"; } std::string suffix; - if (stage == ShaderStage::Fragment) { + if (stage == ProgramType::Fragment) { const auto input_mode{header.ps.GetAttributeUse(location)}; if (skip_unused && input_mode == AttributeUse::Unused) { return; @@ -416,7 +420,7 @@ private: } void DeclareOutputAttributes() { - if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) { + if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) { for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) { DeclareOutputAttribute(ToGenericAttribute(i)); } @@ -538,7 +542,7 @@ private: constexpr u32 element_stride{4}; const u32 address{generic_base + index * generic_stride + element * element_stride}; - const bool declared{stage != ShaderStage::Fragment || + const bool declared{stage != ProgramType::Fragment || header.ps.GetAttributeUse(index) != AttributeUse::Unused}; const std::string value{declared ? ReadAttribute(attribute, element) : "0"}; code.AddLine("case 0x{:x}: return {};", address, value); @@ -642,7 +646,7 @@ private: } if (const auto abuf = std::get_if<AbufNode>(&*node)) { - UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry, + UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry, "Physical attributes in geometry shaders are not implemented"); if (abuf->IsPhysicalBuffer()) { return fmt::format("readPhysicalAttribute(ftou({}))", @@ -697,6 +701,9 @@ private: } if (const auto lmem = std::get_if<LmemNode>(&*node)) { + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); } @@ -726,7 +733,7 @@ private: std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) { const auto GeometryPass = [&](std::string_view name) { - if (stage == ShaderStage::Geometry && buffer) { + if (stage == ProgramType::Geometry && buffer) { // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games // set an 0x80000000 index for those and the shader fails to build. Find out why // this happens and what's its intent. @@ -738,10 +745,10 @@ private: switch (attribute) { case Attribute::Index::Position: switch (stage) { - case ShaderStage::Geometry: + case ProgramType::Geometry: return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer), GetSwizzle(element)); - case ShaderStage::Fragment: + case ProgramType::Fragment: return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)); default: UNREACHABLE(); @@ -762,7 +769,7 @@ private: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. - ASSERT(stage == ShaderStage::Vertex); + ASSERT(IsVertexShader(stage)); switch (element) { case 2: // Config pack's first value is instance_id. @@ -774,7 +781,7 @@ private: return "0"; case Attribute::Index::FrontFacing: // TODO(Subv): Find out what the values are for the other elements. - ASSERT(stage == ShaderStage::Fragment); + ASSERT(stage == ProgramType::Fragment); switch (element) { case 3: return "itof(gl_FrontFacing ? -1 : 0)"; @@ -796,7 +803,7 @@ private: return value; } // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders - const std::string precise = stage != ShaderStage::Fragment ? "precise " : ""; + const std::string precise = stage != ProgramType::Fragment ? "precise " : ""; const std::string temporary = code.GenerateTemporary(); code.AddLine("{}float {} = {};", precise, temporary, value); @@ -831,12 +838,12 @@ private: UNIMPLEMENTED(); return {}; case 1: - if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) { + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } return std::make_pair("gl_Layer", true); case 2: - if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) { + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } return std::make_pair("gl_ViewportIndex", true); @@ -1073,6 +1080,9 @@ private: target = result->first; is_integer = result->second; } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { const std::string real = Visit(gmem->GetRealAddress()); @@ -1400,14 +1410,10 @@ private: return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint)); } - std::string LogicalAll2(Operation operation) { + std::string LogicalAnd2(Operation operation) { return GenerateUnary(operation, "all", Type::Bool, Type::Bool2); } - std::string LogicalAny2(Operation operation) { - return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); - } - template <bool with_nan> std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) { const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, @@ -1630,7 +1636,7 @@ private: } std::string Exit(Operation operation) { - if (stage != ShaderStage::Fragment) { + if (stage != ProgramType::Fragment) { code.AddLine("return;"); return {}; } @@ -1681,7 +1687,7 @@ private: } std::string EmitVertex(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + ASSERT_MSG(stage == ProgramType::Geometry, "EmitVertex is expected to be used in a geometry shader."); // If a geometry shader is attached, it will always flip (it's the last stage before @@ -1692,7 +1698,7 @@ private: } std::string EndPrimitive(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + ASSERT_MSG(stage == ProgramType::Geometry, "EndPrimitive is expected to be used in a geometry shader."); code.AddLine("EndPrimitive();"); @@ -1714,7 +1720,7 @@ private: return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; } - static constexpr OperationDecompilersArray operation_decompilers = { + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, &GLSLDecompiler::Select, @@ -1798,8 +1804,7 @@ private: &GLSLDecompiler::LogicalXor, &GLSLDecompiler::LogicalNegate, &GLSLDecompiler::LogicalPick2, - &GLSLDecompiler::LogicalAll2, - &GLSLDecompiler::LogicalAny2, + &GLSLDecompiler::LogicalAnd2, &GLSLDecompiler::LogicalLessThan<Type::Float>, &GLSLDecompiler::LogicalEqual<Type::Float>, @@ -1863,6 +1868,7 @@ private: &GLSLDecompiler::WorkGroupId<1>, &GLSLDecompiler::WorkGroupId<2>, }; + static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { return GetDeclarationWithSuffix(index, "gpr"); @@ -1927,7 +1933,7 @@ private: } u32 GetNumPhysicalInputAttributes() const { - return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); + return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); } u32 GetNumPhysicalAttributes() const { @@ -1940,7 +1946,7 @@ private: const Device& device; const ShaderIR& ir; - const ShaderStage stage; + const ProgramType stage; const std::string suffix; const Header header; @@ -1971,7 +1977,7 @@ std::string GetCommonDeclarations() { MAX_CONSTBUFFER_ELEMENTS); } -ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, +ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage, const std::string& suffix) { GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 02586736d..2ea02f5bf 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,14 +12,26 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" -namespace OpenGL { -class Device; -} - namespace VideoCommon::Shader { class ShaderIR; } +namespace OpenGL { + +class Device; + +enum class ProgramType : u32 { + VertexA = 0, + VertexB = 1, + TessellationControl = 2, + TessellationEval = 3, + Geometry = 4, + Fragment = 5, + Compute = 6 +}; + +} // namespace OpenGL + namespace OpenGL::GLShader { struct ShaderEntries; @@ -85,6 +97,6 @@ struct ShaderEntries { std::string GetCommonDeclarations(); ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, - Maxwell::ShaderStage stage, const std::string& suffix); + ProgramType stage, const std::string& suffix); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 7893d1e26..969fe9ced 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { } // namespace -ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, +ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b) : unique_identifier{unique_identifier}, program_type{program_type}, diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 4f296dda6..cc8bbd61e 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -18,7 +18,6 @@ #include "common/assert.h" #include "common/common_types.h" #include "core/file_sys/vfs_vector.h" -#include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_gen.h" namespace Core { @@ -34,14 +33,11 @@ namespace OpenGL { struct ShaderDiskCacheUsage; struct ShaderDiskCacheDump; -using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; - using ProgramCode = std::vector<u64>; -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - +using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; using TextureBufferUsage = std::bitset<64>; -/// Allocated bindings used by an OpenGL shader program. +/// Allocated bindings used by an OpenGL shader program struct BaseBindings { u32 cbuf{}; u32 gmem{}; @@ -126,7 +122,7 @@ namespace OpenGL { /// Describes a shader how it's used by the guest GPU class ShaderDiskCacheRaw { public: - explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, + explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b); ShaderDiskCacheRaw(); @@ -141,30 +137,13 @@ public: } bool HasProgramA() const { - return program_type == Maxwell::ShaderProgram::VertexA; + return program_type == ProgramType::VertexA; } - Maxwell::ShaderProgram GetProgramType() const { + ProgramType GetProgramType() const { return program_type; } - Maxwell::ShaderStage GetProgramStage() const { - switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: - return Maxwell::ShaderStage::Vertex; - case Maxwell::ShaderProgram::TesselationControl: - return Maxwell::ShaderStage::TesselationControl; - case Maxwell::ShaderProgram::TesselationEval: - return Maxwell::ShaderStage::TesselationEval; - case Maxwell::ShaderProgram::Geometry: - return Maxwell::ShaderStage::Geometry; - case Maxwell::ShaderProgram::Fragment: - return Maxwell::ShaderStage::Fragment; - } - UNREACHABLE(); - } - const ProgramCode& GetProgramCode() const { return program_code; } @@ -175,7 +154,7 @@ public: private: u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; u32 program_code_size{}; u32 program_code_size_b{}; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index f9ee8429e..3a8d9e1da 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D; using VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::ShaderIR; -static constexpr u32 PROGRAM_OFFSET{10}; +static constexpr u32 PROGRAM_OFFSET = 10; +static constexpr u32 COMPUTE_OFFSET = 0; ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); @@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB; + ProgramResult program = Decompile(device, program_ir, stage, "vertex"); out += program.first; if (setup.IsDualProgram()) { const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b); - ProgramResult program_b = - Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); - + ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b"); out += program_b.first; } @@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { }; )"; + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry"); out += program.first; out += R"( @@ -116,9 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config { )"; const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); - + ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment"); out += program.first; out += R"( @@ -130,4 +127,22 @@ void main() { return {std::move(out), std::move(program.second)}; } +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) { + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + + std::string out = "// Shader Unique Id: CS" + id + "\n\n"; + out += GetCommonDeclarations(); + + const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute"); + out += program.first; + + out += R"( +void main() { + execute_compute(); +} +)"; + return {std::move(out), std::move(program.second)}; +} + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index 7cbc590f8..3833e88ab 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -54,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se /// Generates the GLSL fragment shader program source code for the given FS program ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); +/// Generates the GLSL compute shader program source code for the given CS program +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup); + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 5f3fe067e..9e74eda0d 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -10,21 +10,25 @@ namespace OpenGL::GLShader { -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type; +namespace { +const char* GetStageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: - debug_type = "vertex"; - break; + return "vertex"; case GL_GEOMETRY_SHADER: - debug_type = "geometry"; - break; + return "geometry"; case GL_FRAGMENT_SHADER: - debug_type = "fragment"; - break; - default: - UNREACHABLE(); + return "fragment"; + case GL_COMPUTE_SHADER: + return "compute"; } + UNIMPLEMENTED(); + return "unknown"; +} +} // Anonymous namespace + +GLuint LoadShader(const char* source, GLenum type) { + const char* debug_type = GetStageDebugName(type); const GLuint shader_id = glCreateShader(type); glShaderSource(shader_id, 1, &source, nullptr); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 0eae98afe..f4777d0b0 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -165,6 +165,25 @@ OpenGLState::OpenGLState() { alpha_test.ref = 0.0f; } +void OpenGLState::SetDefaultViewports() { + for (auto& item : viewports) { + item.x = 0; + item.y = 0; + item.width = 0; + item.height = 0; + item.depth_range_near = 0.0f; + item.depth_range_far = 1.0f; + item.scissor.enabled = false; + item.scissor.x = 0; + item.scissor.y = 0; + item.scissor.width = 0; + item.scissor.height = 0; + } + + depth_clamp.far_plane = false; + depth_clamp.near_plane = false; +} + void OpenGLState::ApplyDefaultState() { glEnable(GL_BLEND); glDisable(GL_FRAMEBUFFER_SRGB); @@ -526,7 +545,7 @@ void OpenGLState::ApplySamplers() const { } } -void OpenGLState::Apply() const { +void OpenGLState::Apply() { MICROPROFILE_SCOPE(OpenGL_State); ApplyFramebufferState(); ApplyVertexArrayState(); @@ -536,19 +555,31 @@ void OpenGLState::Apply() const { ApplyPointSize(); ApplyFragmentColorClamp(); ApplyMultisample(); + if (dirty.color_mask) { + ApplyColorMask(); + dirty.color_mask = false; + } ApplyDepthClamp(); - ApplyColorMask(); ApplyViewport(); - ApplyStencilTest(); + if (dirty.stencil_state) { + ApplyStencilTest(); + dirty.stencil_state = false; + } ApplySRgb(); ApplyCulling(); ApplyDepth(); ApplyPrimitiveRestart(); - ApplyBlending(); + if (dirty.blend_state) { + ApplyBlending(); + dirty.blend_state = false; + } ApplyLogicOp(); ApplyTextures(); ApplySamplers(); - ApplyPolygonOffset(); + if (dirty.polygon_offset) { + ApplyPolygonOffset(); + dirty.polygon_offset = false; + } ApplyAlphaTest(); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index b0140495d..fdf9a8a12 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -195,8 +195,9 @@ public: s_rgb_used = false; } + void SetDefaultViewports(); /// Apply this state as the current OpenGL state - void Apply() const; + void Apply(); void ApplyFramebufferState() const; void ApplyVertexArrayState() const; @@ -237,11 +238,41 @@ public: /// Viewport does not affects glClearBuffer so emulate viewport using scissor test void EmulateViewportWithScissor(); + void MarkDirtyBlendState() { + dirty.blend_state = true; + } + + void MarkDirtyStencilState() { + dirty.stencil_state = true; + } + + void MarkDirtyPolygonOffset() { + dirty.polygon_offset = true; + } + + void MarkDirtyColorMask() { + dirty.color_mask = true; + } + + void AllDirty() { + dirty.blend_state = true; + dirty.stencil_state = true; + dirty.polygon_offset = true; + dirty.color_mask = true; + } + private: static OpenGLState cur_state; // Workaround for sRGB problems caused by QT not supporting srgb output static bool s_rgb_used; + struct { + bool blend_state; + bool stencil_state; + bool viewport_state; + bool polygon_offset; + bool color_mask; + } dirty{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b1f6bc7c2..408332f90 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]}; - ASSERT(component_type == format.component_type); return format; } @@ -485,11 +484,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const auto& dst_params{dst_view->GetSurfaceParams()}; OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ prev_state.Apply(); }); + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); OpenGLState state; state.draw.read_framebuffer = src_framebuffer.handle; state.draw.draw_framebuffer = dst_framebuffer.handle; + state.AllDirty(); state.Apply(); u32 buffers{}; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 9ecdddb0d..a05cef3b9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -108,6 +108,7 @@ void RendererOpenGL::SwapBuffers( // Maintain the rasterizer's state as a priority OpenGLState prev_state = OpenGLState::GetCurState(); + state.AllDirty(); state.Apply(); if (framebuffer) { @@ -140,6 +141,7 @@ void RendererOpenGL::SwapBuffers( system.GetPerfStats().BeginSystemFrame(); // Restore the rasterizer state + prev_state.AllDirty(); prev_state.Apply(); } @@ -206,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Link shaders and get variable locations shader.CreateFromSource(vertex_shader, nullptr, fragment_shader); state.draw.shader_program = shader.handle; + state.AllDirty(); state.Apply(); uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix"); uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture"); @@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, // Workaround brigthness problems in SMO by enabling sRGB in the final output // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987 state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed(); + state.AllDirty(); state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data()); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); // Restore default state state.framebuffer_srgb.enabled = false; state.texture_units[0].texture = 0; + state.AllDirty(); state.Apply(); // Clear sRGB state for the next frame OpenGLState::ClearsRGBUsed(); @@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() { GLuint old_read_fb = state.draw.read_framebuffer; GLuint old_draw_fb = state.draw.draw_framebuffer; state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; + state.AllDirty(); state.Apply(); Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() { screenshot_framebuffer.Release(); state.draw.read_framebuffer = old_read_fb; state.draw.draw_framebuffer = old_draw_fb; + state.AllDirty(); state.Apply(); glDeleteRenderbuffers(1, &renderbuffer); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 9b2d8e987..d267712c9 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -205,10 +205,6 @@ public: } private: - using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation); - using OperationDecompilersArray = - std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; - static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount); void AllocateBindings() { @@ -804,12 +800,7 @@ private: return {}; } - Id LogicalAll2(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Id LogicalAny2(Operation operation) { + Id LogicalAnd2(Operation operation) { UNIMPLEMENTED(); return {}; } @@ -1206,7 +1197,7 @@ private: return {}; } - static constexpr OperationDecompilersArray operation_decompilers = { + static constexpr std::array operation_decompilers = { &SPIRVDecompiler::Assign, &SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float, @@ -1291,8 +1282,7 @@ private: &SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>, &SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>, &SPIRVDecompiler::LogicalPick2, - &SPIRVDecompiler::LogicalAll2, - &SPIRVDecompiler::LogicalAny2, + &SPIRVDecompiler::LogicalAnd2, &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>, @@ -1357,6 +1347,7 @@ private: &SPIRVDecompiler::WorkGroupId<1>, &SPIRVDecompiler::WorkGroupId<2>, }; + static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); const VKDevice& device; const ShaderIR& ir; diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 87d8fecaa..1473c282a 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case OpCode::Id::FMUL_R: case OpCode::Id::FMUL_IMM: { // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit. - UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented", - instr.fmul.tab5cb8_2.Value()); - UNIMPLEMENTED_IF_MSG( - instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented", - instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default + if (instr.fmul.tab5cb8_2 != 0) { + LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented", + instr.fmul.tab5cb8_2.Value()); + } + if (instr.fmul.tab5c68_0 != 1) { + LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented", + instr.fmul.tab5c68_0.Value()); + } op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b); diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index 7bcf38f23..6466fc011 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); } } else { - UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); + if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a); diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp index 29be25ca3..ca2f39e8d 100644 --- a/src/video_core/shader/decode/ffma.cpp +++ b/src/video_core/shader/decode/ffma.cpp @@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented"); - UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented", - instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO - UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented", - instr.ffma.tab5980_1.Value()); + if (instr.ffma.tab5980_0 != 1) { + LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value()); + } + if (instr.ffma.tab5980_1 != 0) { + LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value()); + } const Node op_a = GetRegister(instr.gpr8); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index d59d15bd8..a82a6a15c 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -18,43 +18,56 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); + DEBUG_ASSERT(instr.hsetp2.ftz == 0); Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); - Node op_b = [&]() { - switch (opcode->get().GetId()) { - case OpCode::Id::HSETP2_R: - return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, - instr.hsetp2.negate_b); - default: - UNREACHABLE(); - return Immediate(0); - } - }(); - op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b); - - // We can't use the constant predicate as destination. - ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); - - const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0); + Tegra::Shader::PredCondition cond{}; + bool h_and{}; + Node op_b{}; + switch (opcode->get().GetId()) { + case OpCode::Id::HSETP2_C: + cond = instr.hsetp2.cbuf_and_imm.cond; + h_and = instr.hsetp2.cbuf_and_imm.h_and; + op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), + instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b); + break; + case OpCode::Id::HSETP2_IMM: + cond = instr.hsetp2.cbuf_and_imm.cond; + h_and = instr.hsetp2.cbuf_and_imm.h_and; + op_b = UnpackHalfImmediate(instr, true); + break; + case OpCode::Id::HSETP2_R: + cond = instr.hsetp2.reg.cond; + h_and = instr.hsetp2.reg.h_and; + op_b = + UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b, + instr.hsetp2.reg.negate_b), + instr.hsetp2.reg.type_b); + break; + default: + UNREACHABLE(); + op_b = Immediate(0); + } const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op); - const OperationCode pair_combiner = - instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2; - - const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b); - const Node first_pred = Operation(pair_combiner, comparison); + const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred); - // Set the primary predicate to the result of Predicate OP SecondPredicate - const Node value = Operation(combiner, first_pred, second_pred); - SetPredicate(bb, instr.hsetp2.pred3, value); + const auto Write = [&](u64 dest, Node src) { + SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39)); + }; - if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) { - // Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled - const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred); - SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred)); + const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b); + const u64 first = instr.hsetp2.pred0; + const u64 second = instr.hsetp2.pred3; + if (h_and) { + const Node joined = Operation(OperationCode::LogicalAnd2, comparison); + Write(first, joined); + Write(second, Operation(OperationCode::LogicalNegate, joined)); + } else { + Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u))); + Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u))); } return pc; diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp index c3bcf1ae9..5b44cb79c 100644 --- a/src/video_core/shader/decode/hfma2.cpp +++ b/src/video_core/shader/decode/hfma2.cpp @@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) { - UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None); + DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None); } else { - UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None); + DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None); } constexpr auto identity = HalfType::H0_H1; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 7427ed896..715184d67 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -101,8 +101,7 @@ enum class OperationCode { LogicalXor, /// (bool a, bool b) -> bool LogicalNegate, /// (bool a) -> bool LogicalPick2, /// (bool2 pair, uint index) -> bool - LogicalAll2, /// (bool2 a) -> bool - LogicalAny2, /// (bool2 a) -> bool + LogicalAnd2, /// (bool2 a) -> bool LogicalFLessThan, /// (float a, float b) -> bool LogicalFEqual, /// (float a, float b) -> bool diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index a53e02253..55f5949e4 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co return TrackCbuf(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) { - if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) { + for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { + if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) { // Cbuf found in operand. return found; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 7f9623c62..a3a3770a7 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -116,10 +116,10 @@ public: std::lock_guard lock{mutex}; auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty_flags.zeta_buffer) { + if (!maxwell3d.dirty.depth_buffer) { return depth_buffer.view; } - maxwell3d.dirty_flags.zeta_buffer = false; + maxwell3d.dirty.depth_buffer = false; const auto& regs{maxwell3d.regs}; const auto gpu_addr{regs.zeta.Address()}; @@ -145,10 +145,10 @@ public: std::lock_guard lock{mutex}; ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty_flags.color_buffer[index]) { + if (!maxwell3d.dirty.render_target[index]) { return render_targets[index].view; } - maxwell3d.dirty_flags.color_buffer.reset(index); + maxwell3d.dirty.render_target[index] = false; const auto& regs{maxwell3d.regs}; if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || @@ -274,10 +274,11 @@ protected: auto& maxwell3d = system.GPU().Maxwell3D(); const u32 index = surface->GetRenderTarget(); if (index == DEPTH_RT) { - maxwell3d.dirty_flags.zeta_buffer = true; + maxwell3d.dirty.depth_buffer = true; } else { - maxwell3d.dirty_flags.color_buffer.set(index, true); + maxwell3d.dirty.render_target[index] = true; } + maxwell3d.dirty.render_settings = true; } void Register(TSurface surface) { diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index ae21f4753..381644694 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -1843,13 +1843,14 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det "data, or other bugs."); switch (result) { case Core::System::ResultStatus::ErrorSystemFiles: { - QString message = tr("yuzu was unable to locate a Switch system archive"); - if (!details.empty()) { - message.append(tr(": %1. ").arg(QString::fromStdString(details))); + QString message; + if (details.empty()) { + message = + tr("yuzu was unable to locate a Switch system archive. %1").arg(common_message); } else { - message.append(tr(". ")); + message = tr("yuzu was unable to locate a Switch system archive: %1. %2") + .arg(QString::fromStdString(details), common_message); } - message.append(common_message); answer = QMessageBox::question(this, tr("System Archive Not Found"), message, QMessageBox::Yes | QMessageBox::No, QMessageBox::No); @@ -1858,8 +1859,8 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det } case Core::System::ResultStatus::ErrorSharedFont: { - QString message = tr("yuzu was unable to locate the Switch shared fonts. "); - message.append(common_message); + const QString message = + tr("yuzu was unable to locate the Switch shared fonts. %1").arg(common_message); answer = QMessageBox::question(this, tr("Shared Fonts Not Found"), message, QMessageBox::Yes | QMessageBox::No, QMessageBox::No); status_message = tr("Shared Font Missing"); |