summaryrefslogtreecommitdiffstats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r--src/video_core/engines/maxwell_3d.cpp91
-rw-r--r--src/video_core/engines/maxwell_3d.h29
-rw-r--r--src/video_core/engines/shader_bytecode.h20
-rw-r--r--src/video_core/gpu.cpp18
-rw-r--r--src/video_core/gpu.h2
-rw-r--r--src/video_core/gpu_thread.h2
-rw-r--r--src/video_core/memory_manager.cpp14
-rw-r--r--src/video_core/memory_manager.h7
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp128
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h7
-rw-r--r--src/video_core/renderer_opengl/gl_state.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_state.h1
-rw-r--r--src/video_core/renderer_opengl/maxwell_to_gl.h20
-rw-r--r--src/video_core/shader/decode/shift.cpp113
14 files changed, 257 insertions, 196 deletions
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 7cea146f0..0b3e8749b 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -9,6 +9,7 @@
#include "core/core_timing.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
+#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/textures/texture.h"
@@ -519,61 +520,63 @@ void Maxwell3D::ProcessFirmwareCall4() {
regs.reg_array[0xd00] = 1;
}
-void Maxwell3D::ProcessQueryGet() {
+void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
+ struct LongQueryResult {
+ u64_le value;
+ u64_le timestamp;
+ };
+ static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size");
const GPUVAddr sequence_address{regs.query.QueryAddress()};
- // Since the sequence address is given as a GPU VAddr, we have to convert it to an application
- // VAddr before writing.
+ if (long_query) {
+ // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast
+ // GPU, this command may actually take a while to complete in real hardware due to GPU
+ // wait queues.
+ LongQueryResult query_result{payload, system.GPU().GetTicks()};
+ memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
+ } else {
+ memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload));
+ }
+}
+void Maxwell3D::ProcessQueryGet() {
// TODO(Subv): Support the other query units.
ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
"Units other than CROP are unimplemented");
- u64 result = 0;
-
- // TODO(Subv): Support the other query variables
- switch (regs.query.query_get.select) {
- case Regs::QuerySelect::Zero:
- // This seems to actually write the query sequence to the query address.
- result = regs.query.query_sequence;
+ switch (regs.query.query_get.operation) {
+ case Regs::QueryOperation::Release: {
+ const u64 result = regs.query.query_sequence;
+ StampQueryResult(result, regs.query.query_get.short_query == 0);
break;
- default:
- result = 1;
- UNIMPLEMENTED_MSG("Unimplemented query select type {}",
- static_cast<u32>(regs.query.query_get.select.Value()));
}
-
- // TODO(Subv): Research and implement how query sync conditions work.
-
- struct LongQueryResult {
- u64_le value;
- u64_le timestamp;
- };
- static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size");
-
- switch (regs.query.query_get.mode) {
- case Regs::QueryMode::Write:
- case Regs::QueryMode::Write2: {
- u32 sequence = regs.query.query_sequence;
- if (regs.query.query_get.short_query) {
- // Write the current query sequence to the sequence address.
- // TODO(Subv): Find out what happens if you use a long query type but mark it as a short
- // query.
- memory_manager.Write<u32>(sequence_address, sequence);
- } else {
- // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast
- // GPU, this command may actually take a while to complete in real hardware due to GPU
- // wait queues.
- LongQueryResult query_result{};
- query_result.value = result;
- // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming
- query_result.timestamp = system.CoreTiming().GetTicks();
- memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
+ case Regs::QueryOperation::Acquire: {
+ // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU
+ // to write a value that matches the current payload.
+ UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
+ break;
+ }
+ case Regs::QueryOperation::Counter: {
+ u64 result{};
+ switch (regs.query.query_get.select) {
+ case Regs::QuerySelect::Zero:
+ result = 0;
+ break;
+ default:
+ result = 1;
+ UNIMPLEMENTED_MSG("Unimplemented query select type {}",
+ static_cast<u32>(regs.query.query_get.select.Value()));
}
+ StampQueryResult(result, regs.query.query_get.short_query == 0);
+ break;
+ }
+ case Regs::QueryOperation::Trap: {
+ UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
+ break;
+ }
+ default: {
+ UNIMPLEMENTED_MSG("Unknown query operation");
break;
}
- default:
- UNIMPLEMENTED_MSG("Query mode {} not implemented",
- static_cast<u32>(regs.query.query_get.mode.Value()));
}
}
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 8808bbf76..0a2af54e5 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -71,12 +71,11 @@ public:
static constexpr std::size_t MaxConstBuffers = 18;
static constexpr std::size_t MaxConstBufferSize = 0x10000;
- enum class QueryMode : u32 {
- Write = 0,
- Sync = 1,
- // TODO(Subv): It is currently unknown what the difference between method 2 and method 0
- // is.
- Write2 = 2,
+ enum class QueryOperation : u32 {
+ Release = 0,
+ Acquire = 1,
+ Counter = 2,
+ Trap = 3,
};
enum class QueryUnit : u32 {
@@ -704,8 +703,8 @@ public:
INSERT_UNION_PADDING_WORDS(0x15);
s32 stencil_back_func_ref;
- u32 stencil_back_func_mask;
u32 stencil_back_mask;
+ u32 stencil_back_func_mask;
INSERT_UNION_PADDING_WORDS(0xC);
@@ -862,7 +861,11 @@ public:
float point_size;
- INSERT_UNION_PADDING_WORDS(0x7);
+ INSERT_UNION_PADDING_WORDS(0x1);
+
+ u32 point_sprite_enable;
+
+ INSERT_UNION_PADDING_WORDS(0x5);
u32 zeta_enable;
@@ -1077,7 +1080,7 @@ public:
u32 query_sequence;
union {
u32 raw;
- BitField<0, 2, QueryMode> mode;
+ BitField<0, 2, QueryOperation> operation;
BitField<4, 1, u32> fence;
BitField<12, 4, QueryUnit> unit;
BitField<16, 1, QuerySyncCondition> sync_cond;
@@ -1409,6 +1412,9 @@ private:
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
+ // Writes the query result accordingly
+ void StampQueryResult(u64 payload, bool long_query);
+
// Handles Conditional Rendering
void ProcessQueryCondition();
@@ -1458,8 +1464,8 @@ ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
ASSERT_REG_POSITION(patch_vertices, 0x373);
ASSERT_REG_POSITION(scissor_test, 0x380);
ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
-ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D6);
-ASSERT_REG_POSITION(stencil_back_mask, 0x3D7);
+ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
+ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
ASSERT_REG_POSITION(color_mask_common, 0x3E4);
ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
ASSERT_REG_POSITION(depth_bounds, 0x3E7);
@@ -1494,6 +1500,7 @@ ASSERT_REG_POSITION(vb_element_base, 0x50D);
ASSERT_REG_POSITION(vb_base_instance, 0x50E);
ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
ASSERT_REG_POSITION(point_size, 0x546);
+ASSERT_REG_POSITION(point_sprite_enable, 0x548);
ASSERT_REG_POSITION(zeta_enable, 0x54E);
ASSERT_REG_POSITION(multisample_control, 0x54F);
ASSERT_REG_POSITION(condition, 0x554);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 955f981bd..c9bc83cd7 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -624,6 +624,19 @@ enum class ShuffleOperation : u64 {
Bfly = 3, // shuffleXorNV
};
+enum class ShfType : u64 {
+ Bits32 = 0,
+ U64 = 2,
+ S64 = 3,
+};
+
+enum class ShfXmode : u64 {
+ None = 0,
+ HI = 1,
+ X = 2,
+ XHI = 3,
+};
+
union Instruction {
constexpr Instruction& operator=(const Instruction& instr) {
value = instr.value;
@@ -776,6 +789,13 @@ union Instruction {
} shr;
union {
+ BitField<37, 2, ShfType> type;
+ BitField<48, 2, ShfXmode> xmode;
+ BitField<50, 1, u64> wrap;
+ BitField<20, 6, u64> immediate;
+ } shf;
+
+ union {
BitField<39, 5, u64> shift_amount;
BitField<48, 1, u64> negate_b;
BitField<49, 1, u64> negate_a;
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index b9c5c41a2..4419ab735 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -6,6 +6,7 @@
#include "common/microprofile.h"
#include "core/core.h"
#include "core/core_timing.h"
+#include "core/core_timing_util.h"
#include "core/memory.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/kepler_compute.h"
@@ -23,7 +24,7 @@ MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
: system{system}, renderer{renderer}, is_async{is_async} {
auto& rasterizer{renderer.Rasterizer()};
- memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
+ memory_manager = std::make_unique<Tegra::MemoryManager>(system);
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
@@ -122,6 +123,19 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
return true;
}
+u64 GPU::GetTicks() const {
+ // This values were reversed engineered by fincs from NVN
+ // The gpu clock is reported in units of 385/625 nanoseconds
+ constexpr u64 gpu_ticks_num = 384;
+ constexpr u64 gpu_ticks_den = 625;
+
+ const u64 cpu_ticks = system.CoreTiming().GetTicks();
+ const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count();
+ const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
+ const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den;
+ return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den;
+}
+
void GPU::FlushCommands() {
renderer.Rasterizer().FlushCommands();
}
@@ -340,7 +354,7 @@ void GPU::ProcessSemaphoreTriggerMethod() {
block.sequence = regs.semaphore_sequence;
// TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
// CoreTiming
- block.timestamp = system.CoreTiming().GetTicks();
+ block.timestamp = GetTicks();
memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
sizeof(block));
} else {
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index b648317bb..07727210c 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -192,6 +192,8 @@ public:
bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+ u64 GetTicks() const;
+
std::unique_lock<std::mutex> LockSync() {
return std::unique_lock{sync_mutex};
}
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 08dc96bb3..882e2d9c7 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -86,7 +86,7 @@ struct CommandDataContainer {
struct SynchState final {
std::atomic_bool is_running{true};
- using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+ using CommandQueue = Common::MPSCQueue<CommandDataContainer>;
CommandQueue queue;
u64 last_fence{};
std::atomic<u64> signaled_fence{};
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 11848fbce..f1d50be3e 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -9,13 +9,12 @@
#include "core/hle/kernel/process.h"
#include "core/hle/kernel/vm_manager.h"
#include "core/memory.h"
+#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
namespace Tegra {
-MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
- : rasterizer{rasterizer}, system{system} {
+MemoryManager::MemoryManager(Core::System& system) : system{system} {
std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
std::fill(page_table.attributes.begin(), page_table.attributes.end(),
Common::PageType::Unmapped);
@@ -84,7 +83,8 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
const auto cpu_addr = GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
- rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
+ system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size);
+
UnmapRange(gpu_addr, aligned_size);
ASSERT(system.CurrentProcess()
->VMManager()
@@ -242,7 +242,7 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
switch (page_table.attributes[page_index]) {
case Common::PageType::Memory: {
const u8* src_ptr{page_table.pointers[page_index] + page_offset};
- rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+ system.GPU().FlushRegion(ToCacheAddr(src_ptr), copy_amount);
std::memcpy(dest_buffer, src_ptr, copy_amount);
break;
}
@@ -292,7 +292,7 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
switch (page_table.attributes[page_index]) {
case Common::PageType::Memory: {
u8* dest_ptr{page_table.pointers[page_index] + page_offset};
- rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
+ system.GPU().InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
std::memcpy(dest_ptr, src_buffer, copy_amount);
break;
}
@@ -340,7 +340,7 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::
switch (page_table.attributes[page_index]) {
case Common::PageType::Memory: {
const u8* src_ptr{page_table.pointers[page_index] + page_offset};
- rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+ system.GPU().FlushRegion(ToCacheAddr(src_ptr), copy_amount);
WriteBlock(dest_addr, src_ptr, copy_amount);
break;
}
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index aea010087..393447eb4 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -10,10 +10,6 @@
#include "common/common_types.h"
#include "common/page_table.h"
-namespace VideoCore {
-class RasterizerInterface;
-}
-
namespace Core {
class System;
}
@@ -51,7 +47,7 @@ struct VirtualMemoryArea {
class MemoryManager final {
public:
- explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
+ explicit MemoryManager(Core::System& system);
~MemoryManager();
GPUVAddr AllocateSpace(u64 size, u64 align);
@@ -176,7 +172,6 @@ private:
Common::PageTable page_table{page_bits};
VMAMap vma_map;
- VideoCore::RasterizerInterface& rasterizer;
Core::System& system;
};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 362942e09..b0eb14c8b 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -248,9 +248,6 @@ void RasterizerOpenGL::SetupVertexInstances(GLuint vao) {
}
GLintptr RasterizerOpenGL::SetupIndexBuffer() {
- if (accelerate_draw != AccelDraw::Indexed) {
- return 0;
- }
MICROPROFILE_SCOPE(OpenGL_Index);
const auto& regs = system.GPU().Maxwell3D().regs;
const std::size_t size = CalculateIndexBufferSize();
@@ -546,7 +543,8 @@ void RasterizerOpenGL::Clear() {
}
}
-void RasterizerOpenGL::DrawPrelude() {
+void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
+ MICROPROFILE_SCOPE(OpenGL_Drawing);
auto& gpu = system.GPU().Maxwell3D();
SyncRasterizeEnable(state);
@@ -567,9 +565,6 @@ void RasterizerOpenGL::DrawPrelude() {
buffer_cache.Acquire();
- // Draw the vertex batch
- const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
-
std::size_t buffer_size = CalculateVertexArraysSize();
// Add space for index buffer
@@ -596,7 +591,11 @@ void RasterizerOpenGL::DrawPrelude() {
// Upload vertex and index data.
SetupVertexBuffer(vao);
SetupVertexInstances(vao);
- index_buffer_offset = SetupIndexBuffer();
+
+ GLintptr index_buffer_offset;
+ if (is_indexed) {
+ index_buffer_offset = SetupIndexBuffer();
+ }
// Prepare packed bindings.
bind_ubo_pushbuffer.Setup();
@@ -630,6 +629,7 @@ void RasterizerOpenGL::DrawPrelude() {
// As all cached buffers are invalidated, we need to recheck their state.
gpu.dirty.ResetVertexArrays();
}
+ gpu.dirty.memory_general = false;
shader_program_manager->ApplyTo(state);
state.Apply();
@@ -637,106 +637,33 @@ void RasterizerOpenGL::DrawPrelude() {
if (texture_cache.TextureBarrier()) {
glTextureBarrier();
}
-}
-
-struct DrawParams {
- bool is_indexed{};
- bool is_instanced{};
- GLenum primitive_mode{};
- GLint count{};
- GLint base_vertex{};
-
- // Indexed settings
- GLenum index_format{};
- GLintptr index_buffer_offset{};
-
- // Instanced setting
- GLint num_instances{};
- GLint base_instance{};
-
- void DispatchDraw() {
- if (is_indexed) {
- const auto index_buffer_ptr = reinterpret_cast<const void*>(index_buffer_offset);
- if (is_instanced) {
- glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, count, index_format,
- index_buffer_ptr, num_instances,
- base_vertex, base_instance);
- } else {
- glDrawElementsBaseVertex(primitive_mode, count, index_format, index_buffer_ptr,
- base_vertex);
- }
- } else {
- if (is_instanced) {
- glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, count, num_instances,
- base_instance);
- } else {
- glDrawArrays(primitive_mode, base_vertex, count);
- }
- }
- }
-};
-
-bool RasterizerOpenGL::DrawBatch(bool is_indexed) {
- accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays;
- MICROPROFILE_SCOPE(OpenGL_Drawing);
-
- DrawPrelude();
-
- auto& maxwell3d = system.GPU().Maxwell3D();
- const auto& regs = maxwell3d.regs;
- const auto current_instance = maxwell3d.state.current_instance;
- DrawParams draw_call{};
- draw_call.is_indexed = is_indexed;
- draw_call.num_instances = static_cast<GLint>(1);
- draw_call.base_instance = static_cast<GLint>(current_instance);
- draw_call.is_instanced = current_instance > 0;
- draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
- if (draw_call.is_indexed) {
- draw_call.count = static_cast<GLint>(regs.index_array.count);
- draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base);
- draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
- draw_call.index_buffer_offset = index_buffer_offset;
+ const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
+ const GLsizei num_instances =
+ static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1);
+ if (is_indexed) {
+ const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
+ const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);
+ const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count);
+ glDrawElementsInstancedBaseVertexBaseInstance(
+ primitive_mode, num_vertices, index_format,
+ reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex,
+ base_instance);
} else {
- draw_call.count = static_cast<GLint>(regs.vertex_buffer.count);
- draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first);
+ const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first);
+ const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count);
+ glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances,
+ base_instance);
}
- draw_call.DispatchDraw();
+}
- maxwell3d.dirty.memory_general = false;
- accelerate_draw = AccelDraw::Disabled;
+bool RasterizerOpenGL::DrawBatch(bool is_indexed) {
+ Draw(is_indexed, false);
return true;
}
bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) {
- accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays;
-
- MICROPROFILE_SCOPE(OpenGL_Drawing);
-
- DrawPrelude();
-
- auto& maxwell3d = system.GPU().Maxwell3D();
- const auto& regs = maxwell3d.regs;
- const auto& draw_setup = maxwell3d.mme_draw;
- DrawParams draw_call{};
- draw_call.is_indexed = is_indexed;
- draw_call.num_instances = static_cast<GLint>(draw_setup.instance_count);
- draw_call.base_instance = static_cast<GLint>(regs.vb_base_instance);
- draw_call.is_instanced = draw_setup.instance_count > 1;
- draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
- if (draw_call.is_indexed) {
- draw_call.count = static_cast<GLint>(regs.index_array.count);
- draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base);
- draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
- draw_call.index_buffer_offset = index_buffer_offset;
- } else {
- draw_call.count = static_cast<GLint>(regs.vertex_buffer.count);
- draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first);
- }
- draw_call.DispatchDraw();
-
- maxwell3d.dirty.memory_general = false;
- accelerate_draw = AccelDraw::Disabled;
+ Draw(is_indexed, true);
return true;
}
@@ -1293,6 +1220,7 @@ void RasterizerOpenGL::SyncPointState() {
// Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid
// in OpenGL).
state.point.program_control = regs.vp_point_size.enable != 0;
+ state.point.sprite = regs.point_sprite_enable != 0;
state.point.size = std::max(1.0f, regs.point_size);
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 6a27cf497..0501f3828 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -103,7 +103,7 @@ private:
std::size_t size);
/// Syncs all the state, shaders, render targets and textures setting before a draw call.
- void DrawPrelude();
+ void Draw(bool is_indexed, bool is_instanced);
/// Configures the current textures to use for the draw command.
void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
@@ -220,12 +220,7 @@ private:
GLintptr SetupIndexBuffer();
- GLintptr index_buffer_offset;
-
void SetupShaders(GLenum primitive_mode);
-
- enum class AccelDraw { Disabled, Arrays, Indexed };
- AccelDraw accelerate_draw = AccelDraw::Disabled;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index cc185e9e1..ab1f7983c 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -128,6 +128,7 @@ void OpenGLState::ApplyClipDistances() {
void OpenGLState::ApplyPointSize() {
Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control);
+ Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite);
if (UpdateValue(cur_state.point.size, point.size)) {
glPointSize(point.size);
}
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 678e5cd89..4953eeda2 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -132,6 +132,7 @@ public:
struct {
bool program_control = false; // GL_PROGRAM_POINT_SIZE
+ bool sprite = false; // GL_POINT_SPRITE
GLfloat size = 1.0f; // GL_POINT_SIZE
} point;
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index ea4f35663..7ed505628 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -47,8 +47,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return GL_UNSIGNED_INT_2_10_10_10_REV;
default:
- LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- UNREACHABLE();
+ LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
return {};
}
case Maxwell::VertexAttribute::Type::SignedInt:
@@ -72,8 +71,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return GL_INT_2_10_10_10_REV;
default:
- LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- UNREACHABLE();
+ LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
return {};
}
case Maxwell::VertexAttribute::Type::Float:
@@ -89,13 +87,19 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return GL_FLOAT;
default:
- LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- UNREACHABLE();
+ LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+ return {};
+ }
+ case Maxwell::VertexAttribute::Type::UnsignedScaled:
+ switch (attrib.size) {
+ case Maxwell::VertexAttribute::Size::Size_8_8:
+ return GL_UNSIGNED_BYTE;
+ default:
+ LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
return {};
}
default:
- LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
- UNREACHABLE();
+ LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
return {};
}
}
diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp
index d419e9c45..3b391d3e6 100644
--- a/src/video_core/shader/decode/shift.cpp
+++ b/src/video_core/shader/decode/shift.cpp
@@ -10,8 +10,80 @@
namespace VideoCommon::Shader {
+using std::move;
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
+using Tegra::Shader::ShfType;
+using Tegra::Shader::ShfXmode;
+
+namespace {
+
+Node IsFull(Node shift) {
+ return Operation(OperationCode::LogicalIEqual, move(shift), Immediate(32));
+}
+
+Node Shift(OperationCode opcode, Node value, Node shift) {
+ Node is_full = Operation(OperationCode::LogicalIEqual, shift, Immediate(32));
+ Node shifted = Operation(opcode, move(value), shift);
+ return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted));
+}
+
+Node ClampShift(Node shift, s32 size = 32) {
+ shift = Operation(OperationCode::IMax, move(shift), Immediate(0));
+ return Operation(OperationCode::IMin, move(shift), Immediate(size));
+}
+
+Node WrapShift(Node shift, s32 size = 32) {
+ return Operation(OperationCode::UBitwiseAnd, move(shift), Immediate(size - 1));
+}
+
+Node ShiftRight(Node low, Node high, Node shift, Node low_shift, ShfType type) {
+ // These values are used when the shift value is less than 32
+ Node less_low = Shift(OperationCode::ILogicalShiftRight, low, shift);
+ Node less_high = Shift(OperationCode::ILogicalShiftLeft, high, low_shift);
+ Node less = Operation(OperationCode::IBitwiseOr, move(less_high), move(less_low));
+
+ if (type == ShfType::Bits32) {
+ // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits
+ return Operation(OperationCode::Select, IsFull(move(shift)), move(high), move(less));
+ }
+
+ // And these when it's larger than or 32
+ const bool is_signed = type == ShfType::S64;
+ const auto opcode = SignedToUnsignedCode(OperationCode::IArithmeticShiftRight, is_signed);
+ Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32));
+ Node greater = Shift(opcode, high, move(reduced));
+
+ Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32));
+ Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0));
+
+ Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater));
+ return Operation(OperationCode::Select, move(is_zero), move(high), move(value));
+}
+
+Node ShiftLeft(Node low, Node high, Node shift, Node low_shift, ShfType type) {
+ // These values are used when the shift value is less than 32
+ Node less_low = Operation(OperationCode::ILogicalShiftRight, low, low_shift);
+ Node less_high = Operation(OperationCode::ILogicalShiftLeft, high, shift);
+ Node less = Operation(OperationCode::IBitwiseOr, move(less_low), move(less_high));
+
+ if (type == ShfType::Bits32) {
+ // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits
+ return Operation(OperationCode::Select, IsFull(move(shift)), move(low), move(less));
+ }
+
+ // And these when it's larger than or 32
+ Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32));
+ Node greater = Shift(OperationCode::ILogicalShiftLeft, move(low), move(reduced));
+
+ Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32));
+ Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0));
+
+ Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater));
+ return Operation(OperationCode::Select, move(is_zero), move(high), move(value));
+}
+
+} // Anonymous namespace
u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
@@ -28,29 +100,48 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) {
}
}();
- switch (opcode->get().GetId()) {
+ switch (const auto opid = opcode->get().GetId(); opid) {
case OpCode::Id::SHR_C:
case OpCode::Id::SHR_R:
case OpCode::Id::SHR_IMM: {
- if (instr.shr.wrap) {
- op_b = Operation(OperationCode::UBitwiseAnd, std::move(op_b), Immediate(0x1f));
- } else {
- op_b = Operation(OperationCode::IMax, std::move(op_b), Immediate(0));
- op_b = Operation(OperationCode::IMin, std::move(op_b), Immediate(31));
- }
+ op_b = instr.shr.wrap ? WrapShift(move(op_b)) : ClampShift(move(op_b));
Node value = SignedOperation(OperationCode::IArithmeticShiftRight, instr.shift.is_signed,
- std::move(op_a), std::move(op_b));
+ move(op_a), move(op_b));
SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
- SetRegister(bb, instr.gpr0, std::move(value));
+ SetRegister(bb, instr.gpr0, move(value));
break;
}
case OpCode::Id::SHL_C:
case OpCode::Id::SHL_R:
case OpCode::Id::SHL_IMM: {
- const Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b);
+ Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b);
SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
- SetRegister(bb, instr.gpr0, value);
+ SetRegister(bb, instr.gpr0, move(value));
+ break;
+ }
+ case OpCode::Id::SHF_RIGHT_R:
+ case OpCode::Id::SHF_RIGHT_IMM:
+ case OpCode::Id::SHF_LEFT_R:
+ case OpCode::Id::SHF_LEFT_IMM: {
+ UNIMPLEMENTED_IF(instr.generates_cc);
+ UNIMPLEMENTED_IF_MSG(instr.shf.xmode != ShfXmode::None, "xmode={}",
+ static_cast<int>(instr.shf.xmode.Value()));
+
+ if (instr.is_b_imm) {
+ op_b = Immediate(static_cast<u32>(instr.shf.immediate));
+ }
+ const s32 size = instr.shf.type == ShfType::Bits32 ? 32 : 64;
+ Node shift = instr.shf.wrap ? WrapShift(move(op_b), size) : ClampShift(move(op_b), size);
+
+ Node negated_shift = Operation(OperationCode::INegate, shift);
+ Node low_shift = Operation(OperationCode::IAdd, move(negated_shift), Immediate(32));
+
+ const bool is_right = opid == OpCode::Id::SHF_RIGHT_R || opid == OpCode::Id::SHF_RIGHT_IMM;
+ Node value = (is_right ? ShiftRight : ShiftLeft)(
+ move(op_a), GetRegister(instr.gpr39), move(shift), move(low_shift), instr.shf.type);
+
+ SetRegister(bb, instr.gpr0, move(value));
break;
}
default: