summaryrefslogtreecommitdiffstats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r--src/video_core/engines/maxwell_3d.h77
-rw-r--r--src/video_core/engines/shader_bytecode.h57
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp22
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h3
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp12
-rw-r--r--src/video_core/renderer_opengl/gl_state_tracker.cpp7
-rw-r--r--src/video_core/renderer_opengl/gl_state_tracker.h1
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp4
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp4
-rw-r--r--src/video_core/shader/decode/conversion.cpp113
-rw-r--r--src/video_core/shader/decode/texture.cpp14
-rw-r--r--src/video_core/shader/decode/video.cpp58
-rw-r--r--src/video_core/shader/shader_ir.h3
-rw-r--r--src/video_core/texture_cache/texture_cache.h66
-rw-r--r--src/video_core/textures/astc.cpp241
-rw-r--r--src/video_core/textures/texture.h15
16 files changed, 524 insertions, 173 deletions
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index d24c9f657..2977a7d81 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -312,6 +312,35 @@ public:
}
};
+ struct MsaaSampleLocation {
+ union {
+ BitField<0, 4, u32> x0;
+ BitField<4, 4, u32> y0;
+ BitField<8, 4, u32> x1;
+ BitField<12, 4, u32> y1;
+ BitField<16, 4, u32> x2;
+ BitField<20, 4, u32> y2;
+ BitField<24, 4, u32> x3;
+ BitField<28, 4, u32> y3;
+ };
+
+ constexpr std::pair<u32, u32> Location(int index) const {
+ switch (index) {
+ case 0:
+ return {x0, y0};
+ case 1:
+ return {x1, y1};
+ case 2:
+ return {x2, y2};
+ case 3:
+ return {x3, y3};
+ default:
+ UNREACHABLE();
+ return {0, 0};
+ }
+ }
+ };
+
enum class DepthMode : u32 {
MinusOneToOne = 0,
ZeroToOne = 1,
@@ -793,7 +822,13 @@ public:
u32 rt_separate_frag_data;
- INSERT_UNION_PADDING_WORDS(0xC);
+ INSERT_UNION_PADDING_WORDS(0x1);
+
+ u32 multisample_raster_enable;
+ u32 multisample_raster_samples;
+ std::array<u32, 4> multisample_sample_mask;
+
+ INSERT_UNION_PADDING_WORDS(0x5);
struct {
u32 address_high;
@@ -830,7 +865,16 @@ public:
std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;
- INSERT_UNION_PADDING_WORDS(0xF);
+ std::array<MsaaSampleLocation, 4> multisample_sample_locations;
+
+ INSERT_UNION_PADDING_WORDS(0x2);
+
+ union {
+ BitField<0, 1, u32> enable;
+ BitField<4, 3, u32> target;
+ } multisample_coverage_to_color;
+
+ INSERT_UNION_PADDING_WORDS(0x8);
struct {
union {
@@ -922,7 +966,10 @@ public:
BitField<4, 1, u32> triangle_rast_flip;
} screen_y_control;
- INSERT_UNION_PADDING_WORDS(0x21);
+ float line_width_smooth;
+ float line_width_aliased;
+
+ INSERT_UNION_PADDING_WORDS(0x1F);
u32 vb_element_base;
u32 vb_base_instance;
@@ -943,7 +990,7 @@ public:
CounterReset counter_reset;
- INSERT_UNION_PADDING_WORDS(0x1);
+ u32 multisample_enable;
u32 zeta_enable;
@@ -980,7 +1027,7 @@ public:
float polygon_offset_factor;
- INSERT_UNION_PADDING_WORDS(0x1);
+ u32 line_smooth_enable;
struct {
u32 tic_address_high;
@@ -1007,7 +1054,11 @@ public:
float polygon_offset_units;
- INSERT_UNION_PADDING_WORDS(0x11);
+ INSERT_UNION_PADDING_WORDS(0x4);
+
+ Tegra::Texture::MsaaMode multisample_mode;
+
+ INSERT_UNION_PADDING_WORDS(0xC);
union {
BitField<2, 1, u32> coord_origin;
@@ -1507,12 +1558,17 @@ ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
ASSERT_REG_POSITION(color_mask_common, 0x3E4);
-ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
ASSERT_REG_POSITION(depth_bounds, 0x3E7);
+ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
+ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED);
+ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE);
+ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF);
ASSERT_REG_POSITION(zeta, 0x3F8);
ASSERT_REG_POSITION(clear_flags, 0x43E);
ASSERT_REG_POSITION(fill_rectangle, 0x44F);
ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
+ASSERT_REG_POSITION(multisample_sample_locations, 0x478);
+ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E);
ASSERT_REG_POSITION(rt_control, 0x487);
ASSERT_REG_POSITION(zeta_width, 0x48a);
ASSERT_REG_POSITION(zeta_height, 0x48b);
@@ -1538,6 +1594,8 @@ ASSERT_REG_POSITION(stencil_front_func_mask, 0x4E6);
ASSERT_REG_POSITION(stencil_front_mask, 0x4E7);
ASSERT_REG_POSITION(frag_color_clamp, 0x4EA);
ASSERT_REG_POSITION(screen_y_control, 0x4EB);
+ASSERT_REG_POSITION(line_width_smooth, 0x4EC);
+ASSERT_REG_POSITION(line_width_aliased, 0x4ED);
ASSERT_REG_POSITION(vb_element_base, 0x50D);
ASSERT_REG_POSITION(vb_base_instance, 0x50E);
ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
@@ -1545,11 +1603,13 @@ ASSERT_REG_POSITION(samplecnt_enable, 0x545);
ASSERT_REG_POSITION(point_size, 0x546);
ASSERT_REG_POSITION(point_sprite_enable, 0x548);
ASSERT_REG_POSITION(counter_reset, 0x54C);
+ASSERT_REG_POSITION(multisample_enable, 0x54D);
ASSERT_REG_POSITION(zeta_enable, 0x54E);
ASSERT_REG_POSITION(multisample_control, 0x54F);
ASSERT_REG_POSITION(condition, 0x554);
ASSERT_REG_POSITION(tsc, 0x557);
-ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
+ASSERT_REG_POSITION(polygon_offset_factor, 0x55B);
+ASSERT_REG_POSITION(line_smooth_enable, 0x55C);
ASSERT_REG_POSITION(tic, 0x55D);
ASSERT_REG_POSITION(stencil_two_side_enable, 0x565);
ASSERT_REG_POSITION(stencil_back_op_fail, 0x566);
@@ -1558,6 +1618,7 @@ ASSERT_REG_POSITION(stencil_back_op_zpass, 0x568);
ASSERT_REG_POSITION(stencil_back_func_func, 0x569);
ASSERT_REG_POSITION(framebuffer_srgb, 0x56E);
ASSERT_REG_POSITION(polygon_offset_units, 0x56F);
+ASSERT_REG_POSITION(multisample_mode, 0x574);
ASSERT_REG_POSITION(point_coord_replace, 0x581);
ASSERT_REG_POSITION(code_address, 0x582);
ASSERT_REG_POSITION(draw, 0x585);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 498936f0c..c66c66f6c 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -290,6 +290,23 @@ enum class VmadShr : u64 {
Shr15 = 2,
};
+enum class VmnmxType : u64 {
+ Bits8,
+ Bits16,
+ Bits32,
+};
+
+enum class VmnmxOperation : u64 {
+ Mrg_16H = 0,
+ Mrg_16L = 1,
+ Mrg_8B0 = 2,
+ Mrg_8B2 = 3,
+ Acc = 4,
+ Min = 5,
+ Max = 6,
+ Nop = 7,
+};
+
enum class XmadMode : u64 {
None = 0,
CLo = 1,
@@ -1651,6 +1668,42 @@ union Instruction {
} vmad;
union {
+ BitField<54, 1, u64> is_dest_signed;
+ BitField<48, 1, u64> is_src_a_signed;
+ BitField<49, 1, u64> is_src_b_signed;
+ BitField<37, 2, u64> src_format_a;
+ BitField<29, 2, u64> src_format_b;
+ BitField<56, 1, u64> mx;
+ BitField<55, 1, u64> sat;
+ BitField<36, 2, u64> selector_a;
+ BitField<28, 2, u64> selector_b;
+ BitField<50, 1, u64> is_op_b_register;
+ BitField<51, 3, VmnmxOperation> operation;
+
+ VmnmxType SourceFormatA() const {
+ switch (src_format_a) {
+ case 0b11:
+ return VmnmxType::Bits32;
+ case 0b10:
+ return VmnmxType::Bits16;
+ default:
+ return VmnmxType::Bits8;
+ }
+ }
+
+ VmnmxType SourceFormatB() const {
+ switch (src_format_b) {
+ case 0b11:
+ return VmnmxType::Bits32;
+ case 0b10:
+ return VmnmxType::Bits16;
+ default:
+ return VmnmxType::Bits8;
+ }
+ }
+ } vmnmx;
+
+ union {
BitField<20, 16, u64> imm20_16;
BitField<35, 1, u64> high_b_rr; // used on RR
BitField<36, 1, u64> product_shift_left;
@@ -1763,6 +1816,7 @@ public:
MEMBAR,
VMAD,
VSETP,
+ VMNMX,
FFMA_IMM, // Fused Multiply and Add
FFMA_CR,
FFMA_RC,
@@ -2070,6 +2124,7 @@ private:
INST("1110111110011---", Id::MEMBAR, Type::Trivial, "MEMBAR"),
INST("01011111--------", Id::VMAD, Type::Video, "VMAD"),
INST("0101000011110---", Id::VSETP, Type::Video, "VSETP"),
+ INST("0011101---------", Id::VMNMX, Type::Video, "VMNMX"),
INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
INST("010010011-------", Id::FFMA_CR, Type::Ffma, "FFMA_CR"),
INST("010100011-------", Id::FFMA_RC, Type::Ffma, "FFMA_RC"),
@@ -2170,7 +2225,7 @@ private:
INST("0011011-11111---", Id::SHF_LEFT_IMM, Type::Shift, "SHF_LEFT_IMM"),
INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"),
INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"),
- INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
+ INST("0011100-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"),
INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"),
INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"),
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 368f399df..f31d960c7 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -345,7 +345,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
texture_cache.GuardRenderTargets(true);
- View depth_surface = texture_cache.GetDepthBufferSurface(true);
+ View depth_surface = texture_cache.GetDepthBufferSurface();
const auto& regs = gpu.regs;
UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
@@ -354,7 +354,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
FramebufferCacheKey key;
const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
for (std::size_t index = 0; index < colors_count; ++index) {
- View color_surface{texture_cache.GetColorBufferSurface(index, true)};
+ View color_surface{texture_cache.GetColorBufferSurface(index)};
if (!color_surface) {
continue;
}
@@ -387,12 +387,12 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using
View color_surface;
if (using_color_fb) {
const std::size_t index = regs.clear_buffers.RT;
- color_surface = texture_cache.GetColorBufferSurface(index, true);
+ color_surface = texture_cache.GetColorBufferSurface(index);
texture_cache.MarkColorBufferInUse(index);
}
View depth_surface;
if (using_depth_fb || using_stencil_fb) {
- depth_surface = texture_cache.GetDepthBufferSurface(true);
+ depth_surface = texture_cache.GetDepthBufferSurface();
texture_cache.MarkDepthBufferInUse();
}
texture_cache.GuardRenderTargets(false);
@@ -496,6 +496,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
SyncPrimitiveRestart();
SyncScissorTest();
SyncPointState();
+ SyncLineState();
SyncPolygonOffset();
SyncAlphaTest();
SyncFramebufferSRGB();
@@ -1311,6 +1312,19 @@ void RasterizerOpenGL::SyncPointState() {
glDisable(GL_PROGRAM_POINT_SIZE);
}
+void RasterizerOpenGL::SyncLineState() {
+ auto& gpu = system.GPU().Maxwell3D();
+ auto& flags = gpu.dirty.flags;
+ if (!flags[Dirty::LineWidth]) {
+ return;
+ }
+ flags[Dirty::LineWidth] = false;
+
+ const auto& regs = gpu.regs;
+ oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable);
+ glLineWidth(regs.line_smooth_enable ? regs.line_width_smooth : regs.line_width_aliased);
+}
+
void RasterizerOpenGL::SyncPolygonOffset() {
auto& gpu = system.GPU().Maxwell3D();
auto& flags = gpu.dirty.flags;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 212dad852..435da4425 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -171,6 +171,9 @@ private:
/// Syncs the point state to match the guest state
void SyncPointState();
+ /// Syncs the line state to match the guest state
+ void SyncLineState();
+
/// Syncs the rasterizer enable state to match the guest state
void SyncRasterizeEnable();
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 160ae4340..1f1f01313 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1819,15 +1819,15 @@ private:
}
Expression HMergeH0(Operation operation) {
- std::string dest = VisitOperand(operation, 0).AsUint();
- std::string src = VisitOperand(operation, 1).AsUint();
- return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", src, dest), Type::Uint};
+ const std::string dest = VisitOperand(operation, 0).AsUint();
+ const std::string src = VisitOperand(operation, 1).AsUint();
+ return {fmt::format("bitfieldInsert({}, {}, 0, 16)", dest, src), Type::Uint};
}
Expression HMergeH1(Operation operation) {
- std::string dest = VisitOperand(operation, 0).AsUint();
- std::string src = VisitOperand(operation, 1).AsUint();
- return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", dest, src), Type::Uint};
+ const std::string dest = VisitOperand(operation, 0).AsUint();
+ const std::string src = VisitOperand(operation, 1).AsUint();
+ return {fmt::format("bitfieldInsert({}, {}, 16, 16)", dest, src), Type::Uint};
}
Expression HPack2(Operation operation) {
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp
index 255ac3147..d24fad3de 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -185,6 +185,12 @@ void SetupDirtyPointSize(Tables& tables) {
tables[0][OFF(point_sprite_enable)] = PointSize;
}
+void SetupDirtyLineWidth(Tables& tables) {
+ tables[0][OFF(line_width_smooth)] = LineWidth;
+ tables[0][OFF(line_width_aliased)] = LineWidth;
+ tables[0][OFF(line_smooth_enable)] = LineWidth;
+}
+
void SetupDirtyClipControl(Tables& tables) {
auto& table = tables[0];
table[OFF(screen_y_control)] = ClipControl;
@@ -233,6 +239,7 @@ void StateTracker::Initialize() {
SetupDirtyLogicOp(tables);
SetupDirtyFragmentClampColor(tables);
SetupDirtyPointSize(tables);
+ SetupDirtyLineWidth(tables);
SetupDirtyClipControl(tables);
SetupDirtyDepthClampEnabled(tables);
SetupDirtyMisc(tables);
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h
index b882d75c3..0f823288e 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -78,6 +78,7 @@ enum : u8 {
LogicOp,
FragmentClampColor,
PointSize,
+ LineWidth,
ClipControl,
DepthClampEnabled,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 36590a6d0..0b4d999d7 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -411,14 +411,13 @@ CachedSurfaceView::~CachedSurfaceView() = default;
void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
ASSERT(params.num_levels == 1);
- const GLuint texture = surface.GetTexture();
if (params.num_layers > 1) {
// Layered framebuffer attachments
UNIMPLEMENTED_IF(params.base_layer != 0);
switch (params.target) {
case SurfaceTarget::Texture2DArray:
- glFramebufferTexture(target, attachment, texture, params.base_level);
+ glFramebufferTexture(target, attachment, GetTexture(), params.base_level);
break;
default:
UNIMPLEMENTED();
@@ -427,6 +426,7 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
}
const GLenum view_target = surface.GetTarget();
+ const GLuint texture = surface.GetTexture();
switch (surface.GetSurfaceParams().target) {
case SurfaceTarget::Texture1D:
glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index af9420c19..33cbc0bb6 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -609,7 +609,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
Texceptions texceptions;
for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
if (update_rendertargets) {
- color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+ color_attachments[rt] = texture_cache.GetColorBufferSurface(rt);
}
if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
texceptions[rt] = true;
@@ -617,7 +617,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
}
if (update_rendertargets) {
- zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+ zeta_attachment = texture_cache.GetDepthBufferSurface();
}
if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
texceptions[ZETA_TEXCEPTION_INDEX] = true;
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index c72690b2b..b9989c88c 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -2,6 +2,10 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <limits>
+#include <optional>
+#include <utility>
+
#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/engines/shader_bytecode.h"
@@ -15,9 +19,49 @@ using Tegra::Shader::OpCode;
using Tegra::Shader::Register;
namespace {
+
constexpr OperationCode GetFloatSelector(u64 selector) {
return selector == 0 ? OperationCode::FCastHalf0 : OperationCode::FCastHalf1;
}
+
+constexpr u32 SizeInBits(Register::Size size) {
+ switch (size) {
+ case Register::Size::Byte:
+ return 8;
+ case Register::Size::Short:
+ return 16;
+ case Register::Size::Word:
+ return 32;
+ case Register::Size::Long:
+ return 64;
+ }
+ return 0;
+}
+
+constexpr std::optional<std::pair<s32, s32>> IntegerSaturateBounds(Register::Size src_size,
+ Register::Size dst_size,
+ bool src_signed,
+ bool dst_signed) {
+ const u32 dst_bits = SizeInBits(dst_size);
+ if (src_size == Register::Size::Word && dst_size == Register::Size::Word) {
+ if (src_signed == dst_signed) {
+ return std::nullopt;
+ }
+ return std::make_pair(0, std::numeric_limits<s32>::max());
+ }
+ if (dst_signed) {
+ // Signed destination, clamp to [-128, 127] for instance
+ return std::make_pair(-(1 << (dst_bits - 1)), (1 << (dst_bits - 1)) - 1);
+ } else {
+ // Unsigned destination
+ if (dst_bits == 32) {
+ // Avoid shifting by 32, that is undefined behavior
+ return std::make_pair(0, s32(std::numeric_limits<u32>::max()));
+ }
+ return std::make_pair(0, (1 << dst_bits) - 1);
+ }
+}
+
} // Anonymous namespace
u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
@@ -28,14 +72,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
case OpCode::Id::I2I_R:
case OpCode::Id::I2I_C:
case OpCode::Id::I2I_IMM: {
- UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
- UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
- UNIMPLEMENTED_IF(instr.alu.saturate_d);
+ const bool src_signed = instr.conversion.is_input_signed;
+ const bool dst_signed = instr.conversion.is_output_signed;
+ const Register::Size src_size = instr.conversion.src_size;
+ const Register::Size dst_size = instr.conversion.dst_size;
+ const u32 selector = static_cast<u32>(instr.conversion.int_src.selector);
- const bool input_signed = instr.conversion.is_input_signed;
- const bool output_signed = instr.conversion.is_output_signed;
-
- Node value = [&]() {
+ Node value = [this, instr, opcode] {
switch (opcode->get().GetId()) {
case OpCode::Id::I2I_R:
return GetRegister(instr.gpr20);
@@ -48,16 +91,60 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
return Immediate(0);
}
}();
- value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
- value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a,
- input_signed);
- if (input_signed != output_signed) {
- value = SignedOperation(OperationCode::ICastUnsigned, output_signed, NO_PRECISE, value);
+ // Ensure the source selector is valid
+ switch (instr.conversion.src_size) {
+ case Register::Size::Byte:
+ break;
+ case Register::Size::Short:
+ ASSERT(selector == 0 || selector == 2);
+ break;
+ default:
+ ASSERT(selector == 0);
+ break;
+ }
+
+ if (src_size != Register::Size::Word || selector != 0) {
+ value = SignedOperation(OperationCode::IBitfieldExtract, src_signed, std::move(value),
+ Immediate(selector * 8), Immediate(SizeInBits(src_size)));
+ }
+
+ value = GetOperandAbsNegInteger(std::move(value), instr.conversion.abs_a,
+ instr.conversion.negate_a, src_signed);
+
+ if (instr.alu.saturate_d) {
+ if (src_signed && !dst_signed) {
+ Node is_negative = Operation(OperationCode::LogicalUGreaterEqual, value,
+ Immediate(1 << (SizeInBits(src_size) - 1)));
+ value = Operation(OperationCode::Select, std::move(is_negative), Immediate(0),
+ std::move(value));
+
+ // Simplify generated expressions, this can be removed without semantic impact
+ SetTemporary(bb, 0, std::move(value));
+ value = GetTemporary(0);
+
+ if (dst_size != Register::Size::Word) {
+ const Node limit = Immediate((1 << SizeInBits(dst_size)) - 1);
+ Node is_large =
+ Operation(OperationCode::LogicalUGreaterThan, std::move(value), limit);
+ value = Operation(OperationCode::Select, std::move(is_large), limit,
+ std::move(value));
+ }
+ } else if (const std::optional bounds =
+ IntegerSaturateBounds(src_size, dst_size, src_signed, dst_signed)) {
+ value = SignedOperation(OperationCode::IMax, src_signed, std::move(value),
+ Immediate(bounds->first));
+ value = SignedOperation(OperationCode::IMin, src_signed, std::move(value),
+ Immediate(bounds->second));
+ }
+ } else if (dst_size != Register::Size::Word) {
+ // No saturation, we only have to mask the result
+ Node mask = Immediate((1 << SizeInBits(dst_size)) - 1);
+ value = Operation(OperationCode::UBitwiseAnd, std::move(value), std::move(mask));
}
SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
- SetRegister(bb, instr.gpr0, value);
+ SetRegister(bb, instr.gpr0, std::move(value));
break;
}
case OpCode::Id::I2F_R:
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 48350e042..6c4a1358b 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -780,20 +780,6 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
// When lod is used always is in gpr20
const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0);
- // Fill empty entries from the guest sampler
- const std::size_t entry_coord_count = GetCoordCount(sampler.GetType());
- if (type_coord_count != entry_coord_count) {
- LOG_WARNING(HW_GPU, "Bound and built texture types mismatch");
-
- // When the size is higher we insert zeroes
- for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) {
- coords.push_back(GetRegister(Register::ZeroIndex));
- }
-
- // Then we ensure the size matches the number of entries (dropping unused values)
- coords.resize(entry_coord_count);
- }
-
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
diff --git a/src/video_core/shader/decode/video.cpp b/src/video_core/shader/decode/video.cpp
index b047cf870..64ba60ea2 100644
--- a/src/video_core/shader/decode/video.cpp
+++ b/src/video_core/shader/decode/video.cpp
@@ -10,16 +10,24 @@
namespace VideoCommon::Shader {
+using std::move;
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
using Tegra::Shader::Pred;
using Tegra::Shader::VideoType;
using Tegra::Shader::VmadShr;
+using Tegra::Shader::VmnmxOperation;
+using Tegra::Shader::VmnmxType;
u32 ShaderIR::DecodeVideo(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
+ if (opcode->get().GetId() == OpCode::Id::VMNMX) {
+ DecodeVMNMX(bb, instr);
+ return pc;
+ }
+
const Node op_a =
GetVideoOperand(GetRegister(instr.gpr8), instr.video.is_byte_chunk_a, instr.video.signed_a,
instr.video.type_a, instr.video.byte_height_a);
@@ -109,4 +117,54 @@ Node ShaderIR::GetVideoOperand(Node op, bool is_chunk, bool is_signed,
}
}
+void ShaderIR::DecodeVMNMX(NodeBlock& bb, Tegra::Shader::Instruction instr) {
+ UNIMPLEMENTED_IF(!instr.vmnmx.is_op_b_register);
+ UNIMPLEMENTED_IF(instr.vmnmx.SourceFormatA() != VmnmxType::Bits32);
+ UNIMPLEMENTED_IF(instr.vmnmx.SourceFormatB() != VmnmxType::Bits32);
+ UNIMPLEMENTED_IF(instr.vmnmx.is_src_a_signed != instr.vmnmx.is_src_b_signed);
+ UNIMPLEMENTED_IF(instr.vmnmx.sat);
+ UNIMPLEMENTED_IF(instr.generates_cc);
+
+ Node op_a = GetRegister(instr.gpr8);
+ Node op_b = GetRegister(instr.gpr20);
+ Node op_c = GetRegister(instr.gpr39);
+
+ const bool is_oper1_signed = instr.vmnmx.is_src_a_signed; // Stubbed
+ const bool is_oper2_signed = instr.vmnmx.is_dest_signed;
+
+ const auto operation_a = instr.vmnmx.mx ? OperationCode::IMax : OperationCode::IMin;
+ Node value = SignedOperation(operation_a, is_oper1_signed, move(op_a), move(op_b));
+
+ switch (instr.vmnmx.operation) {
+ case VmnmxOperation::Mrg_16H:
+ value = BitfieldInsert(move(op_c), move(value), 16, 16);
+ break;
+ case VmnmxOperation::Mrg_16L:
+ value = BitfieldInsert(move(op_c), move(value), 0, 16);
+ break;
+ case VmnmxOperation::Mrg_8B0:
+ value = BitfieldInsert(move(op_c), move(value), 0, 8);
+ break;
+ case VmnmxOperation::Mrg_8B2:
+ value = BitfieldInsert(move(op_c), move(value), 16, 8);
+ break;
+ case VmnmxOperation::Acc:
+ value = Operation(OperationCode::IAdd, move(value), move(op_c));
+ break;
+ case VmnmxOperation::Min:
+ value = SignedOperation(OperationCode::IMin, is_oper2_signed, move(value), move(op_c));
+ break;
+ case VmnmxOperation::Max:
+ value = SignedOperation(OperationCode::IMax, is_oper2_signed, move(value), move(op_c));
+ break;
+ case VmnmxOperation::Nop:
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+
+ SetRegister(bb, instr.gpr0, move(value));
+}
+
} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index ca6c976c9..c6e7bdf50 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -354,6 +354,9 @@ private:
/// Marks the usage of a input or output attribute.
void MarkAttributeUsage(Tegra::Shader::Attribute::Index index, u64 element);
+ /// Decodes VMNMX instruction and inserts its code into the passed basic block.
+ void DecodeVMNMX(NodeBlock& bb, Tegra::Shader::Instruction instr);
+
void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
const Node4& components);
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 88fe3e25f..cfc7fe6e9 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -108,7 +108,7 @@ public:
}
const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)};
- const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false);
+ const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false);
if (guard_samplers) {
sampled_textures.push_back(surface);
}
@@ -128,7 +128,7 @@ public:
return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
}
const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)};
- const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false);
+ const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false);
if (guard_samplers) {
sampled_textures.push_back(surface);
}
@@ -143,7 +143,7 @@ public:
return any_rt;
}
- TView GetDepthBufferSurface(bool preserve_contents) {
+ TView GetDepthBufferSurface() {
std::lock_guard lock{mutex};
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) {
@@ -164,7 +164,7 @@ public:
return {};
}
const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)};
- auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true);
+ auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, true);
if (depth_buffer.target)
depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
depth_buffer.target = surface_view.first;
@@ -174,7 +174,7 @@ public:
return surface_view.second;
}
- TView GetColorBufferSurface(std::size_t index, bool preserve_contents) {
+ TView GetColorBufferSurface(std::size_t index) {
std::lock_guard lock{mutex};
ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
auto& maxwell3d = system.GPU().Maxwell3D();
@@ -204,9 +204,8 @@ public:
return {};
}
- auto surface_view =
- GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
- preserve_contents, true);
+ auto surface_view = GetSurface(gpu_addr, *cpu_addr,
+ SurfaceParams::CreateForFramebuffer(system, index), true);
if (render_targets[index].target)
render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
render_targets[index].target = surface_view.first;
@@ -260,9 +259,9 @@ public:
const std::optional<VAddr> src_cpu_addr =
system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
std::pair<TSurface, TView> dst_surface =
- GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
+ GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, false);
std::pair<TSurface, TView> src_surface =
- GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
+ GetSurface(src_gpu_addr, *src_cpu_addr, src_params, false);
ImageBlit(src_surface.second, dst_surface.second, copy_config);
dst_surface.first->MarkAsModified(true, Tick());
}
@@ -451,22 +450,18 @@ private:
* @param overlaps The overlapping surfaces registered in the cache.
* @param params The parameters for the new surface.
* @param gpu_addr The starting address of the new surface.
- * @param preserve_contents Indicates that the new surface should be loaded from memory or left
- * blank.
* @param untopological Indicates to the recycler that the texture has no way to match the
* overlaps due to topological reasons.
**/
std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
const SurfaceParams& params, const GPUVAddr gpu_addr,
- const bool preserve_contents,
const MatchTopologyResult untopological) {
- const bool do_load = preserve_contents && Settings::values.use_accurate_gpu_emulation;
for (auto& surface : overlaps) {
Unregister(surface);
}
switch (PickStrategy(overlaps, params, gpu_addr, untopological)) {
case RecycleStrategy::Ignore: {
- return InitializeSurface(gpu_addr, params, do_load);
+ return InitializeSurface(gpu_addr, params, Settings::values.use_accurate_gpu_emulation);
}
case RecycleStrategy::Flush: {
std::sort(overlaps.begin(), overlaps.end(),
@@ -476,7 +471,7 @@ private:
for (auto& surface : overlaps) {
FlushSurface(surface);
}
- return InitializeSurface(gpu_addr, params, preserve_contents);
+ return InitializeSurface(gpu_addr, params);
}
case RecycleStrategy::BufferCopy: {
auto new_surface = GetUncachedSurface(gpu_addr, params);
@@ -485,7 +480,7 @@ private:
}
default: {
UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!");
- return InitializeSurface(gpu_addr, params, do_load);
+ return InitializeSurface(gpu_addr, params);
}
}
}
@@ -621,14 +616,11 @@ private:
* @param params The parameters on the new surface.
* @param gpu_addr The starting address of the new surface.
* @param cache_addr The starting address of the new surface on physical memory.
- * @param preserve_contents Indicates that the new surface should be loaded from memory or
- * left blank.
*/
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
const SurfaceParams& params,
const GPUVAddr gpu_addr,
- const VAddr cpu_addr,
- bool preserve_contents) {
+ const VAddr cpu_addr) {
if (params.target == SurfaceTarget::Texture3D) {
bool failed = false;
if (params.num_levels > 1) {
@@ -677,7 +669,7 @@ private:
return std::nullopt;
}
Unregister(surface);
- return InitializeSurface(gpu_addr, params, preserve_contents);
+ return InitializeSurface(gpu_addr, params);
}
return std::nullopt;
}
@@ -688,7 +680,7 @@ private:
return {{surface, surface->GetMainView()}};
}
}
- return InitializeSurface(gpu_addr, params, preserve_contents);
+ return InitializeSurface(gpu_addr, params);
}
}
@@ -711,13 +703,10 @@ private:
*
* @param gpu_addr The starting address of the candidate surface.
* @param params The parameters on the candidate surface.
- * @param preserve_contents Indicates that the new surface should be loaded from memory or
- * left blank.
* @param is_render Whether or not the surface is a render target.
**/
std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr,
- const SurfaceParams& params, bool preserve_contents,
- bool is_render) {
+ const SurfaceParams& params, bool is_render) {
// Step 1
// Check Level 1 Cache for a fast structural match. If candidate surface
// matches at certain level we are pretty much done.
@@ -726,8 +715,7 @@ private:
const auto topological_result = current_surface->MatchesTopology(params);
if (topological_result != MatchTopologyResult::FullMatch) {
std::vector<TSurface> overlaps{current_surface};
- return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
- topological_result);
+ return RecycleSurface(overlaps, params, gpu_addr, topological_result);
}
const auto struct_result = current_surface->MatchesStructure(params);
@@ -752,7 +740,7 @@ private:
// If none are found, we are done. we just load the surface and create it.
if (overlaps.empty()) {
- return InitializeSurface(gpu_addr, params, preserve_contents);
+ return InitializeSurface(gpu_addr, params);
}
// Step 3
@@ -762,15 +750,13 @@ private:
for (const auto& surface : overlaps) {
const auto topological_result = surface->MatchesTopology(params);
if (topological_result != MatchTopologyResult::FullMatch) {
- return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
- topological_result);
+ return RecycleSurface(overlaps, params, gpu_addr, topological_result);
}
}
// Check if it's a 3D texture
if (params.block_depth > 0) {
- auto surface =
- Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
+ auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr);
if (surface) {
return *surface;
}
@@ -790,8 +776,7 @@ private:
return *view;
}
}
- return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
- MatchTopologyResult::FullMatch);
+ return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch);
}
// Now we check if the candidate is a mipmap/layer of the overlap
std::optional<TView> view =
@@ -815,7 +800,7 @@ private:
pair.first->EmplaceView(params, gpu_addr, candidate_size);
if (mirage_view)
return {pair.first, *mirage_view};
- return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
+ return RecycleSurface(overlaps, params, gpu_addr,
MatchTopologyResult::FullMatch);
}
return {current_surface, *view};
@@ -831,8 +816,7 @@ private:
}
}
// We failed all the tests, recycle the overlaps into a new texture.
- return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
- MatchTopologyResult::FullMatch);
+ return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch);
}
/**
@@ -990,10 +974,10 @@ private:
}
std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params,
- bool preserve_contents) {
+ bool do_load = true) {
auto new_surface{GetUncachedSurface(gpu_addr, params)};
Register(new_surface);
- if (preserve_contents) {
+ if (do_load) {
LoadSurface(new_surface);
}
return {new_surface, new_surface->GetMainView()};
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
#include <cstring>
#include <vector>
+#include <boost/container/static_vector.hpp>
+
#include "common/common_types.h"
#include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
class InputBitStream {
public:
- explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
- : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+ constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
+ : cur_byte{ptr}, next_bit{start_offset % 8} {}
- std::size_t GetBitsRead() const {
- return m_BitsRead;
+ constexpr std::size_t GetBitsRead() const {
+ return bits_read;
}
- u32 ReadBit() {
- u32 bit = *m_CurByte >> m_NextBit++;
- while (m_NextBit >= 8) {
- m_NextBit -= 8;
- m_CurByte++;
+ constexpr bool ReadBit() {
+ const bool bit = (*cur_byte >> next_bit++) & 1;
+ while (next_bit >= 8) {
+ next_bit -= 8;
+ cur_byte++;
}
- m_BitsRead++;
- return bit & 1;
+ bits_read++;
+ return bit;
}
- u32 ReadBits(std::size_t nBits) {
+ constexpr u32 ReadBits(std::size_t nBits) {
u32 ret = 0;
for (std::size_t i = 0; i < nBits; ++i) {
ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
}
template <std::size_t nBits>
- u32 ReadBits() {
+ constexpr u32 ReadBits() {
u32 ret = 0;
for (std::size_t i = 0; i < nBits; ++i) {
ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,58 @@ public:
}
private:
- const u8* m_CurByte;
- std::size_t m_NextBit = 0;
- std::size_t m_BitsRead = 0;
+ const u8* cur_byte;
+ std::size_t next_bit = 0;
+ std::size_t bits_read = 0;
};
class OutputBitStream {
public:
- explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)
- : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
-
- ~OutputBitStream() = default;
+ constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
+ : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
- s32 GetBitsWritten() const {
- return m_BitsWritten;
+ constexpr std::size_t GetBitsWritten() const {
+ return bits_written;
}
- void WriteBitsR(u32 val, u32 nBits) {
+ constexpr void WriteBitsR(u32 val, u32 nBits) {
for (u32 i = 0; i < nBits; i++) {
WriteBit((val >> (nBits - i - 1)) & 1);
}
}
- void WriteBits(u32 val, u32 nBits) {
+ constexpr void WriteBits(u32 val, u32 nBits) {
for (u32 i = 0; i < nBits; i++) {
WriteBit((val >> i) & 1);
}
}
private:
- void WriteBit(s32 b) {
-
- if (done)
+ constexpr void WriteBit(bool b) {
+ if (bits_written >= num_bits) {
return;
+ }
- const u32 mask = 1 << m_NextBit++;
+ const u32 mask = 1 << next_bit++;
// clear the bit
- *m_CurByte &= static_cast<u8>(~mask);
+ *cur_byte &= static_cast<u8>(~mask);
// Write the bit, if necessary
if (b)
- *m_CurByte |= static_cast<u8>(mask);
+ *cur_byte |= static_cast<u8>(mask);
// Next byte?
- if (m_NextBit >= 8) {
- m_CurByte += 1;
- m_NextBit = 0;
+ if (next_bit >= 8) {
+ cur_byte += 1;
+ next_bit = 0;
}
-
- done = done || ++m_BitsWritten >= m_NumBits;
}
- s32 m_BitsWritten = 0;
- const s32 m_NumBits;
- u8* m_CurByte;
- s32 m_NextBit = 0;
-
- bool done = false;
+ u8* cur_byte;
+ std::size_t num_bits;
+ std::size_t bits_written = 0;
+ std::size_t next_bit = 0;
};
template <typename IntType>
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
u32 trit_value;
};
};
+using IntegerEncodedVector = boost::container::static_vector<
+ IntegerEncodedValue, 64,
+ boost::container::static_vector_options<
+ boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+ boost::container::throw_on_overflow<false>>::type>;
-static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
- u32 nBitsPerValue) {
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
// Implement the algorithm in section C.2.12
u32 m[5];
u32 t[5];
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
}
}
-static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
+static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
u32 nBitsPerValue) {
// Implement the algorithm in section C.2.12
u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
// Fills result with the values that are encoded in the given
// bitstream. We must know beforehand what the maximum possible
// value is, and how many values we're decoding.
-static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,
- u32 maxRange, u32 nValues) {
+static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
+ u32 nValues) {
// Determine encoding parameters
IntegerEncodedValue val = EncodingsValues[maxRange];
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
// is the same as [(numBits - 1):0] and repeats all the way down.
template <typename IntType>
-static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
- if (numBits == 0)
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+ if (numBits == 0) {
return 0;
- if (toBit == 0)
+ }
+ if (toBit == 0) {
return 0;
- IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+ }
+ const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
IntType res = v;
u32 reslen = numBits;
while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
return res;
}
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+ return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+ std::array<IntType, NumReplicateEntries(num_bits)> table{};
+ for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+ table[value] = Replicate(value, num_bits, to_bit);
+ }
+ return table;
+}
+
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+ return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+ return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+ return REPLICATE_BIT_TO_9_TABLE[value];
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
+/// to the runtime implementation
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+ switch (num_bits) {
+ case 1:
+ return REPLICATE_1_BIT_TO_8_TABLE[value];
+ case 2:
+ return REPLICATE_2_BIT_TO_8_TABLE[value];
+ case 3:
+ return REPLICATE_3_BIT_TO_8_TABLE[value];
+ case 4:
+ return REPLICATE_4_BIT_TO_8_TABLE[value];
+ case 5:
+ return REPLICATE_5_BIT_TO_8_TABLE[value];
+ case 6:
+ return REPLICATE_6_BIT_TO_8_TABLE[value];
+ case 7:
+ return REPLICATE_7_BIT_TO_8_TABLE[value];
+ case 8:
+ return REPLICATE_8_BIT_TO_8_TABLE[value];
+ default:
+ return Replicate(value, num_bits, 8);
+ }
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+ switch (num_bits) {
+ case 1:
+ return REPLICATE_1_BIT_TO_6_TABLE[value];
+ case 2:
+ return REPLICATE_2_BIT_TO_6_TABLE[value];
+ case 3:
+ return REPLICATE_3_BIT_TO_6_TABLE[value];
+ case 4:
+ return REPLICATE_4_BIT_TO_6_TABLE[value];
+ case 5:
+ return REPLICATE_5_BIT_TO_6_TABLE[value];
+ default:
+ return Replicate(value, num_bits, 6);
+ }
+}
+
class Pixel {
protected:
using ChannelType = s16;
@@ -674,10 +759,10 @@ public:
// significant bits when going from larger to smaller bit depth
// or by repeating the most significant bits when going from
// smaller to larger bit depths.
- void ChangeBitDepth(const u8 (&depth)[4]) {
+ void ChangeBitDepth() {
for (u32 i = 0; i < 4; i++) {
- Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
- m_BitDepth[i] = depth[i];
+ Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
+ m_BitDepth[i] = 8;
}
}
@@ -689,28 +774,23 @@ public:
// Changes the bit depth of a single component. See the comment
// above for how we do this.
- static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {
- assert(newDepth <= 8);
+ static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
assert(oldDepth <= 8);
- if (oldDepth == newDepth) {
+ if (oldDepth == 8) {
// Do nothing
return val;
- } else if (oldDepth == 0 && newDepth != 0) {
- return static_cast<ChannelType>((1 << newDepth) - 1);
- } else if (newDepth > oldDepth) {
- return Replicate(val, oldDepth, newDepth);
+ } else if (oldDepth == 0) {
+ return static_cast<ChannelType>((1 << 8) - 1);
+ } else if (8 > oldDepth) {
+ return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
} else {
// oldDepth > newDepth
- if (newDepth == 0) {
- return 0xFF;
- } else {
- u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);
- u16 v = static_cast<u16>(val);
- v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
- v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
- return static_cast<u8>(v);
- }
+ const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
+ u16 v = static_cast<u16>(val);
+ v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
+ v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
+ return static_cast<u8>(v);
}
assert(false && "We shouldn't get here.");
@@ -760,8 +840,7 @@ public:
// up in the most-significant byte.
u32 Pack() const {
Pixel eightBit(*this);
- const u8 eightBitDepth[4] = {8, 8, 8, 8};
- eightBit.ChangeBitDepth(eightBitDepth);
+ eightBit.ChangeBitDepth();
u32 r = 0;
r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
}
// We now have enough to decode our integer sequence.
- std::vector<IntegerEncodedValue> decodedColorValues;
- decodedColorValues.reserve(32);
+ IntegerEncodedVector decodedColorValues;
InputBitStream colorStream(data);
DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
u32 A = 0, B = 0, C = 0, D = 0;
// A is just the lsb replicated 9 times.
- A = Replicate(bitval & 1, 1, 9);
+ A = ReplicateBitTo9(bitval & 1);
switch (val.encoding) {
// Replicate bits
case IntegerEncoding::JustBits:
- out[outIdx++] = Replicate(bitval, bitlen, 8);
+ out[outIdx++] = FastReplicateTo8(bitval, bitlen);
break;
// Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
u32 bitval = val.bit_value;
u32 bitlen = val.num_bits;
- u32 A = Replicate(bitval & 1, 1, 7);
+ u32 A = ReplicateBitTo7(bitval & 1);
u32 B = 0, C = 0, D = 0;
u32 result = 0;
switch (val.encoding) {
case IntegerEncoding::JustBits:
- result = Replicate(bitval, bitlen, 6);
+ result = FastReplicateTo6(bitval, bitlen);
break;
case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
return result;
}
-static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,
+static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
const TexelWeightParams& params, const u32 blockWidth,
const u32 blockHeight) {
u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
- std::vector<IntegerEncodedValue> texelWeightValues;
- texelWeightValues.reserve(64);
+ IntegerEncodedVector texelWeightValues;
InputBitStream weightStream(texelWeightData);
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
Pixel p;
for (u32 c = 0; c < 4; c++) {
u32 C0 = endpos32s[partition][0].Component(c);
- C0 = Replicate(C0, 8, 16);
+ C0 = ReplicateByteTo16(C0);
u32 C1 = endpos32s[partition][1].Component(c);
- C1 = Replicate(C1, 8, 16);
+ C1 = ReplicateByteTo16(C1);
u32 plane = 0;
if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 59b8a5e66..eba05aced 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -131,6 +131,20 @@ enum class SwizzleSource : u32 {
OneFloat = 7,
};
+enum class MsaaMode : u32 {
+ Msaa1x1 = 0,
+ Msaa2x1 = 1,
+ Msaa2x2 = 2,
+ Msaa4x2 = 3,
+ Msaa4x2_D3D = 4,
+ Msaa2x1_D3D = 5,
+ Msaa4x4 = 6,
+ Msaa2x2_VC4 = 8,
+ Msaa2x2_VC12 = 9,
+ Msaa4x2_VC8 = 10,
+ Msaa4x2_VC24 = 11,
+};
+
union TextureHandle {
TextureHandle(u32 raw) : raw{raw} {}
@@ -197,6 +211,7 @@ struct TICEntry {
union {
BitField<0, 4, u32> res_min_mip_level;
BitField<4, 4, u32> res_max_mip_level;
+ BitField<8, 4, MsaaMode> msaa_mode;
BitField<12, 12, u32> min_lod_clamp;
};