diff options
Diffstat (limited to 'src/video_core')
-rw-r--r-- | src/video_core/engines/maxwell_3d.cpp | 91 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_3d.h | 29 | ||||
-rw-r--r-- | src/video_core/engines/shader_bytecode.h | 20 | ||||
-rw-r--r-- | src/video_core/gpu.cpp | 18 | ||||
-rw-r--r-- | src/video_core/gpu.h | 2 | ||||
-rw-r--r-- | src/video_core/gpu_thread.h | 2 | ||||
-rw-r--r-- | src/video_core/memory_manager.cpp | 14 | ||||
-rw-r--r-- | src/video_core/memory_manager.h | 7 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 128 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 7 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_state.cpp | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_state.h | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/maxwell_to_gl.h | 20 | ||||
-rw-r--r-- | src/video_core/shader/decode/shift.cpp | 113 |
14 files changed, 257 insertions, 196 deletions
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 7cea146f0..0b3e8749b 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -9,6 +9,7 @@ #include "core/core_timing.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/textures/texture.h" @@ -519,61 +520,63 @@ void Maxwell3D::ProcessFirmwareCall4() { regs.reg_array[0xd00] = 1; } -void Maxwell3D::ProcessQueryGet() { +void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { + struct LongQueryResult { + u64_le value; + u64_le timestamp; + }; + static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); const GPUVAddr sequence_address{regs.query.QueryAddress()}; - // Since the sequence address is given as a GPU VAddr, we have to convert it to an application - // VAddr before writing. + if (long_query) { + // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast + // GPU, this command may actually take a while to complete in real hardware due to GPU + // wait queues. + LongQueryResult query_result{payload, system.GPU().GetTicks()}; + memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + } else { + memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload)); + } +} +void Maxwell3D::ProcessQueryGet() { // TODO(Subv): Support the other query units. ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, "Units other than CROP are unimplemented"); - u64 result = 0; - - // TODO(Subv): Support the other query variables - switch (regs.query.query_get.select) { - case Regs::QuerySelect::Zero: - // This seems to actually write the query sequence to the query address. - result = regs.query.query_sequence; + switch (regs.query.query_get.operation) { + case Regs::QueryOperation::Release: { + const u64 result = regs.query.query_sequence; + StampQueryResult(result, regs.query.query_get.short_query == 0); break; - default: - result = 1; - UNIMPLEMENTED_MSG("Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); } - - // TODO(Subv): Research and implement how query sync conditions work. - - struct LongQueryResult { - u64_le value; - u64_le timestamp; - }; - static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); - - switch (regs.query.query_get.mode) { - case Regs::QueryMode::Write: - case Regs::QueryMode::Write2: { - u32 sequence = regs.query.query_sequence; - if (regs.query.query_get.short_query) { - // Write the current query sequence to the sequence address. - // TODO(Subv): Find out what happens if you use a long query type but mark it as a short - // query. - memory_manager.Write<u32>(sequence_address, sequence); - } else { - // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast - // GPU, this command may actually take a while to complete in real hardware due to GPU - // wait queues. - LongQueryResult query_result{}; - query_result.value = result; - // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming - query_result.timestamp = system.CoreTiming().GetTicks(); - memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + case Regs::QueryOperation::Acquire: { + // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU + // to write a value that matches the current payload. + UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); + break; + } + case Regs::QueryOperation::Counter: { + u64 result{}; + switch (regs.query.query_get.select) { + case Regs::QuerySelect::Zero: + result = 0; + break; + default: + result = 1; + UNIMPLEMENTED_MSG("Unimplemented query select type {}", + static_cast<u32>(regs.query.query_get.select.Value())); } + StampQueryResult(result, regs.query.query_get.short_query == 0); + break; + } + case Regs::QueryOperation::Trap: { + UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); + break; + } + default: { + UNIMPLEMENTED_MSG("Unknown query operation"); break; } - default: - UNIMPLEMENTED_MSG("Query mode {} not implemented", - static_cast<u32>(regs.query.query_get.mode.Value())); } } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 8808bbf76..0a2af54e5 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -71,12 +71,11 @@ public: static constexpr std::size_t MaxConstBuffers = 18; static constexpr std::size_t MaxConstBufferSize = 0x10000; - enum class QueryMode : u32 { - Write = 0, - Sync = 1, - // TODO(Subv): It is currently unknown what the difference between method 2 and method 0 - // is. - Write2 = 2, + enum class QueryOperation : u32 { + Release = 0, + Acquire = 1, + Counter = 2, + Trap = 3, }; enum class QueryUnit : u32 { @@ -704,8 +703,8 @@ public: INSERT_UNION_PADDING_WORDS(0x15); s32 stencil_back_func_ref; - u32 stencil_back_func_mask; u32 stencil_back_mask; + u32 stencil_back_func_mask; INSERT_UNION_PADDING_WORDS(0xC); @@ -862,7 +861,11 @@ public: float point_size; - INSERT_UNION_PADDING_WORDS(0x7); + INSERT_UNION_PADDING_WORDS(0x1); + + u32 point_sprite_enable; + + INSERT_UNION_PADDING_WORDS(0x5); u32 zeta_enable; @@ -1077,7 +1080,7 @@ public: u32 query_sequence; union { u32 raw; - BitField<0, 2, QueryMode> mode; + BitField<0, 2, QueryOperation> operation; BitField<4, 1, u32> fence; BitField<12, 4, QueryUnit> unit; BitField<16, 1, QuerySyncCondition> sync_cond; @@ -1409,6 +1412,9 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); + // Writes the query result accordingly + void StampQueryResult(u64 payload, bool long_query); + // Handles Conditional Rendering void ProcessQueryCondition(); @@ -1458,8 +1464,8 @@ ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); ASSERT_REG_POSITION(patch_vertices, 0x373); ASSERT_REG_POSITION(scissor_test, 0x380); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); -ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D6); -ASSERT_REG_POSITION(stencil_back_mask, 0x3D7); +ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); +ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); ASSERT_REG_POSITION(color_mask_common, 0x3E4); ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(depth_bounds, 0x3E7); @@ -1494,6 +1500,7 @@ ASSERT_REG_POSITION(vb_element_base, 0x50D); ASSERT_REG_POSITION(vb_base_instance, 0x50E); ASSERT_REG_POSITION(clip_distance_enabled, 0x544); ASSERT_REG_POSITION(point_size, 0x546); +ASSERT_REG_POSITION(point_sprite_enable, 0x548); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); ASSERT_REG_POSITION(condition, 0x554); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 955f981bd..c9bc83cd7 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -624,6 +624,19 @@ enum class ShuffleOperation : u64 { Bfly = 3, // shuffleXorNV }; +enum class ShfType : u64 { + Bits32 = 0, + U64 = 2, + S64 = 3, +}; + +enum class ShfXmode : u64 { + None = 0, + HI = 1, + X = 2, + XHI = 3, +}; + union Instruction { constexpr Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -776,6 +789,13 @@ union Instruction { } shr; union { + BitField<37, 2, ShfType> type; + BitField<48, 2, ShfXmode> xmode; + BitField<50, 1, u64> wrap; + BitField<20, 6, u64> immediate; + } shf; + + union { BitField<39, 5, u64> shift_amount; BitField<48, 1, u64> negate_b; BitField<49, 1, u64> negate_a; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index b9c5c41a2..4419ab735 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "core/core.h" #include "core/core_timing.h" +#include "core/core_timing_util.h" #include "core/memory.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" @@ -23,7 +24,7 @@ MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async) : system{system}, renderer{renderer}, is_async{is_async} { auto& rasterizer{renderer.Rasterizer()}; - memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); + memory_manager = std::make_unique<Tegra::MemoryManager>(system); dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer); @@ -122,6 +123,19 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { return true; } +u64 GPU::GetTicks() const { + // This values were reversed engineered by fincs from NVN + // The gpu clock is reported in units of 385/625 nanoseconds + constexpr u64 gpu_ticks_num = 384; + constexpr u64 gpu_ticks_den = 625; + + const u64 cpu_ticks = system.CoreTiming().GetTicks(); + const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); + const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; + const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; + return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; +} + void GPU::FlushCommands() { renderer.Rasterizer().FlushCommands(); } @@ -340,7 +354,7 @@ void GPU::ProcessSemaphoreTriggerMethod() { block.sequence = regs.semaphore_sequence; // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of // CoreTiming - block.timestamp = system.CoreTiming().GetTicks(); + block.timestamp = GetTicks(); memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, sizeof(block)); } else { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index b648317bb..07727210c 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -192,6 +192,8 @@ public: bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + u64 GetTicks() const; + std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 08dc96bb3..882e2d9c7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -86,7 +86,7 @@ struct CommandDataContainer { struct SynchState final { std::atomic_bool is_running{true}; - using CommandQueue = Common::SPSCQueue<CommandDataContainer>; + using CommandQueue = Common::MPSCQueue<CommandDataContainer>; CommandQueue queue; u64 last_fence{}; std::atomic<u64> signaled_fence{}; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 11848fbce..f1d50be3e 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -9,13 +9,12 @@ #include "core/hle/kernel/process.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" namespace Tegra { -MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : rasterizer{rasterizer}, system{system} { +MemoryManager::MemoryManager(Core::System& system) : system{system} { std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr); std::fill(page_table.attributes.begin(), page_table.attributes.end(), Common::PageType::Unmapped); @@ -84,7 +83,8 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { const auto cpu_addr = GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size); + system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size); + UnmapRange(gpu_addr, aligned_size); ASSERT(system.CurrentProcess() ->VMManager() @@ -242,7 +242,7 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { const u8* src_ptr{page_table.pointers[page_index] + page_offset}; - rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); + system.GPU().FlushRegion(ToCacheAddr(src_ptr), copy_amount); std::memcpy(dest_buffer, src_ptr, copy_amount); break; } @@ -292,7 +292,7 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { u8* dest_ptr{page_table.pointers[page_index] + page_offset}; - rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); + system.GPU().InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); std::memcpy(dest_ptr, src_buffer, copy_amount); break; } @@ -340,7 +340,7 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std:: switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { const u8* src_ptr{page_table.pointers[page_index] + page_offset}; - rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); + system.GPU().FlushRegion(ToCacheAddr(src_ptr), copy_amount); WriteBlock(dest_addr, src_ptr, copy_amount); break; } diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index aea010087..393447eb4 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -10,10 +10,6 @@ #include "common/common_types.h" #include "common/page_table.h" -namespace VideoCore { -class RasterizerInterface; -} - namespace Core { class System; } @@ -51,7 +47,7 @@ struct VirtualMemoryArea { class MemoryManager final { public: - explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer); + explicit MemoryManager(Core::System& system); ~MemoryManager(); GPUVAddr AllocateSpace(u64 size, u64 align); @@ -176,7 +172,6 @@ private: Common::PageTable page_table{page_bits}; VMAMap vma_map; - VideoCore::RasterizerInterface& rasterizer; Core::System& system; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 362942e09..b0eb14c8b 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -248,9 +248,6 @@ void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { } GLintptr RasterizerOpenGL::SetupIndexBuffer() { - if (accelerate_draw != AccelDraw::Indexed) { - return 0; - } MICROPROFILE_SCOPE(OpenGL_Index); const auto& regs = system.GPU().Maxwell3D().regs; const std::size_t size = CalculateIndexBufferSize(); @@ -546,7 +543,8 @@ void RasterizerOpenGL::Clear() { } } -void RasterizerOpenGL::DrawPrelude() { +void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { + MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); SyncRasterizeEnable(state); @@ -567,9 +565,6 @@ void RasterizerOpenGL::DrawPrelude() { buffer_cache.Acquire(); - // Draw the vertex batch - const bool is_indexed = accelerate_draw == AccelDraw::Indexed; - std::size_t buffer_size = CalculateVertexArraysSize(); // Add space for index buffer @@ -596,7 +591,11 @@ void RasterizerOpenGL::DrawPrelude() { // Upload vertex and index data. SetupVertexBuffer(vao); SetupVertexInstances(vao); - index_buffer_offset = SetupIndexBuffer(); + + GLintptr index_buffer_offset; + if (is_indexed) { + index_buffer_offset = SetupIndexBuffer(); + } // Prepare packed bindings. bind_ubo_pushbuffer.Setup(); @@ -630,6 +629,7 @@ void RasterizerOpenGL::DrawPrelude() { // As all cached buffers are invalidated, we need to recheck their state. gpu.dirty.ResetVertexArrays(); } + gpu.dirty.memory_general = false; shader_program_manager->ApplyTo(state); state.Apply(); @@ -637,106 +637,33 @@ void RasterizerOpenGL::DrawPrelude() { if (texture_cache.TextureBarrier()) { glTextureBarrier(); } -} - -struct DrawParams { - bool is_indexed{}; - bool is_instanced{}; - GLenum primitive_mode{}; - GLint count{}; - GLint base_vertex{}; - - // Indexed settings - GLenum index_format{}; - GLintptr index_buffer_offset{}; - - // Instanced setting - GLint num_instances{}; - GLint base_instance{}; - - void DispatchDraw() { - if (is_indexed) { - const auto index_buffer_ptr = reinterpret_cast<const void*>(index_buffer_offset); - if (is_instanced) { - glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, count, index_format, - index_buffer_ptr, num_instances, - base_vertex, base_instance); - } else { - glDrawElementsBaseVertex(primitive_mode, count, index_format, index_buffer_ptr, - base_vertex); - } - } else { - if (is_instanced) { - glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, count, num_instances, - base_instance); - } else { - glDrawArrays(primitive_mode, base_vertex, count); - } - } - } -}; - -bool RasterizerOpenGL::DrawBatch(bool is_indexed) { - accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays; - MICROPROFILE_SCOPE(OpenGL_Drawing); - - DrawPrelude(); - - auto& maxwell3d = system.GPU().Maxwell3D(); - const auto& regs = maxwell3d.regs; - const auto current_instance = maxwell3d.state.current_instance; - DrawParams draw_call{}; - draw_call.is_indexed = is_indexed; - draw_call.num_instances = static_cast<GLint>(1); - draw_call.base_instance = static_cast<GLint>(current_instance); - draw_call.is_instanced = current_instance > 0; - draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); - if (draw_call.is_indexed) { - draw_call.count = static_cast<GLint>(regs.index_array.count); - draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base); - draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); - draw_call.index_buffer_offset = index_buffer_offset; + const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLsizei num_instances = + static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + if (is_indexed) { + const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + glDrawElementsInstancedBaseVertexBaseInstance( + primitive_mode, num_vertices, index_format, + reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex, + base_instance); } else { - draw_call.count = static_cast<GLint>(regs.vertex_buffer.count); - draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first); + const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances, + base_instance); } - draw_call.DispatchDraw(); +} - maxwell3d.dirty.memory_general = false; - accelerate_draw = AccelDraw::Disabled; +bool RasterizerOpenGL::DrawBatch(bool is_indexed) { + Draw(is_indexed, false); return true; } bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) { - accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays; - - MICROPROFILE_SCOPE(OpenGL_Drawing); - - DrawPrelude(); - - auto& maxwell3d = system.GPU().Maxwell3D(); - const auto& regs = maxwell3d.regs; - const auto& draw_setup = maxwell3d.mme_draw; - DrawParams draw_call{}; - draw_call.is_indexed = is_indexed; - draw_call.num_instances = static_cast<GLint>(draw_setup.instance_count); - draw_call.base_instance = static_cast<GLint>(regs.vb_base_instance); - draw_call.is_instanced = draw_setup.instance_count > 1; - draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); - if (draw_call.is_indexed) { - draw_call.count = static_cast<GLint>(regs.index_array.count); - draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base); - draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); - draw_call.index_buffer_offset = index_buffer_offset; - } else { - draw_call.count = static_cast<GLint>(regs.vertex_buffer.count); - draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first); - } - draw_call.DispatchDraw(); - - maxwell3d.dirty.memory_general = false; - accelerate_draw = AccelDraw::Disabled; + Draw(is_indexed, true); return true; } @@ -1293,6 +1220,7 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). state.point.program_control = regs.vp_point_size.enable != 0; + state.point.sprite = regs.point_sprite_enable != 0; state.point.size = std::max(1.0f, regs.point_size); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 6a27cf497..0501f3828 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -103,7 +103,7 @@ private: std::size_t size); /// Syncs all the state, shaders, render targets and textures setting before a draw call. - void DrawPrelude(); + void Draw(bool is_indexed, bool is_instanced); /// Configures the current textures to use for the draw command. void SetupDrawTextures(std::size_t stage_index, const Shader& shader); @@ -220,12 +220,7 @@ private: GLintptr SetupIndexBuffer(); - GLintptr index_buffer_offset; - void SetupShaders(GLenum primitive_mode); - - enum class AccelDraw { Disabled, Arrays, Indexed }; - AccelDraw accelerate_draw = AccelDraw::Disabled; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index cc185e9e1..ab1f7983c 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -128,6 +128,7 @@ void OpenGLState::ApplyClipDistances() { void OpenGLState::ApplyPointSize() { Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); + Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite); if (UpdateValue(cur_state.point.size, point.size)) { glPointSize(point.size); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 678e5cd89..4953eeda2 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -132,6 +132,7 @@ public: struct { bool program_control = false; // GL_PROGRAM_POINT_SIZE + bool sprite = false; // GL_POINT_SPRITE GLfloat size = 1.0f; // GL_POINT_SIZE } point; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ea4f35663..7ed505628 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -47,8 +47,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } case Maxwell::VertexAttribute::Type::SignedInt: @@ -72,8 +71,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } case Maxwell::VertexAttribute::Type::Float: @@ -89,13 +87,19 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + return {}; + } + case Maxwell::VertexAttribute::Type::UnsignedScaled: + switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8_8: + return GL_UNSIGNED_BYTE; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); return {}; } } diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index d419e9c45..3b391d3e6 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -10,8 +10,80 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::ShfType; +using Tegra::Shader::ShfXmode; + +namespace { + +Node IsFull(Node shift) { + return Operation(OperationCode::LogicalIEqual, move(shift), Immediate(32)); +} + +Node Shift(OperationCode opcode, Node value, Node shift) { + Node is_full = Operation(OperationCode::LogicalIEqual, shift, Immediate(32)); + Node shifted = Operation(opcode, move(value), shift); + return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted)); +} + +Node ClampShift(Node shift, s32 size = 32) { + shift = Operation(OperationCode::IMax, move(shift), Immediate(0)); + return Operation(OperationCode::IMin, move(shift), Immediate(size)); +} + +Node WrapShift(Node shift, s32 size = 32) { + return Operation(OperationCode::UBitwiseAnd, move(shift), Immediate(size - 1)); +} + +Node ShiftRight(Node low, Node high, Node shift, Node low_shift, ShfType type) { + // These values are used when the shift value is less than 32 + Node less_low = Shift(OperationCode::ILogicalShiftRight, low, shift); + Node less_high = Shift(OperationCode::ILogicalShiftLeft, high, low_shift); + Node less = Operation(OperationCode::IBitwiseOr, move(less_high), move(less_low)); + + if (type == ShfType::Bits32) { + // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits + return Operation(OperationCode::Select, IsFull(move(shift)), move(high), move(less)); + } + + // And these when it's larger than or 32 + const bool is_signed = type == ShfType::S64; + const auto opcode = SignedToUnsignedCode(OperationCode::IArithmeticShiftRight, is_signed); + Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32)); + Node greater = Shift(opcode, high, move(reduced)); + + Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32)); + Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0)); + + Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater)); + return Operation(OperationCode::Select, move(is_zero), move(high), move(value)); +} + +Node ShiftLeft(Node low, Node high, Node shift, Node low_shift, ShfType type) { + // These values are used when the shift value is less than 32 + Node less_low = Operation(OperationCode::ILogicalShiftRight, low, low_shift); + Node less_high = Operation(OperationCode::ILogicalShiftLeft, high, shift); + Node less = Operation(OperationCode::IBitwiseOr, move(less_low), move(less_high)); + + if (type == ShfType::Bits32) { + // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits + return Operation(OperationCode::Select, IsFull(move(shift)), move(low), move(less)); + } + + // And these when it's larger than or 32 + Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32)); + Node greater = Shift(OperationCode::ILogicalShiftLeft, move(low), move(reduced)); + + Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32)); + Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0)); + + Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater)); + return Operation(OperationCode::Select, move(is_zero), move(high), move(value)); +} + +} // Anonymous namespace u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; @@ -28,29 +100,48 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { } }(); - switch (opcode->get().GetId()) { + switch (const auto opid = opcode->get().GetId(); opid) { case OpCode::Id::SHR_C: case OpCode::Id::SHR_R: case OpCode::Id::SHR_IMM: { - if (instr.shr.wrap) { - op_b = Operation(OperationCode::UBitwiseAnd, std::move(op_b), Immediate(0x1f)); - } else { - op_b = Operation(OperationCode::IMax, std::move(op_b), Immediate(0)); - op_b = Operation(OperationCode::IMin, std::move(op_b), Immediate(31)); - } + op_b = instr.shr.wrap ? WrapShift(move(op_b)) : ClampShift(move(op_b)); Node value = SignedOperation(OperationCode::IArithmeticShiftRight, instr.shift.is_signed, - std::move(op_a), std::move(op_b)); + move(op_a), move(op_b)); SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, std::move(value)); + SetRegister(bb, instr.gpr0, move(value)); break; } case OpCode::Id::SHL_C: case OpCode::Id::SHL_R: case OpCode::Id::SHL_IMM: { - const Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b); + Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b); SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetRegister(bb, instr.gpr0, move(value)); + break; + } + case OpCode::Id::SHF_RIGHT_R: + case OpCode::Id::SHF_RIGHT_IMM: + case OpCode::Id::SHF_LEFT_R: + case OpCode::Id::SHF_LEFT_IMM: { + UNIMPLEMENTED_IF(instr.generates_cc); + UNIMPLEMENTED_IF_MSG(instr.shf.xmode != ShfXmode::None, "xmode={}", + static_cast<int>(instr.shf.xmode.Value())); + + if (instr.is_b_imm) { + op_b = Immediate(static_cast<u32>(instr.shf.immediate)); + } + const s32 size = instr.shf.type == ShfType::Bits32 ? 32 : 64; + Node shift = instr.shf.wrap ? WrapShift(move(op_b), size) : ClampShift(move(op_b), size); + + Node negated_shift = Operation(OperationCode::INegate, shift); + Node low_shift = Operation(OperationCode::IAdd, move(negated_shift), Immediate(32)); + + const bool is_right = opid == OpCode::Id::SHF_RIGHT_R || opid == OpCode::Id::SHF_RIGHT_IMM; + Node value = (is_right ? ShiftRight : ShiftLeft)( + move(op_a), GetRegister(instr.gpr39), move(shift), move(low_shift), instr.shf.type); + + SetRegister(bb, instr.gpr0, move(value)); break; } default: |