10 files changed, 493 insertions, 138 deletions
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 771eb5abc..3c869d3a1 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -679,7 +679,19 @@ public:
 
                 INSERT_PADDING_WORDS(0x7);
 
-                INSERT_PADDING_WORDS(0x46);
+                INSERT_PADDING_WORDS(0x20);
+
+                struct {
+                    u32 is_instanced[NumVertexArrays];
+
+                    /// Returns whether the vertex array specified by index is supposed to be
+                    /// accessed per instance or not.
+                    bool IsInstancingEnabled(u32 index) const {
+                        return is_instanced[index];
+                    }
+                } instanced_arrays;
+
+                INSERT_PADDING_WORDS(0x6);
 
                 Cull cull;
 
@@ -928,6 +940,7 @@ ASSERT_REG_POSITION(point_coord_replace, 0x581);
 ASSERT_REG_POSITION(code_address, 0x582);
 ASSERT_REG_POSITION(draw, 0x585);
 ASSERT_REG_POSITION(index_array, 0x5F2);
+ASSERT_REG_POSITION(instanced_arrays, 0x620);
 ASSERT_REG_POSITION(cull, 0x646);
 ASSERT_REG_POSITION(clear_buffers, 0x674);
 ASSERT_REG_POSITION(query, 0x6C0);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index b038a9d92..3ba6fe614 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -12,6 +12,7 @@
 
 #include <boost/optional.hpp>
 
+#include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_types.h"
 
@@ -79,6 +80,9 @@ union Attribute {
         // shader, and a tuple of (TessCoord.x, TessCoord.y, TessCoord.z, ~) when inside a Tess Eval
         // shader.
         TessCoordInstanceIDVertexID = 47,
+        // This attribute contains a tuple of (Unk, Unk, Unk, gl_FrontFacing) when inside a fragment
+        // shader. It is unknown what the other values contain.
+        FrontFacing = 63,
     };
 
     union {
@@ -214,6 +218,18 @@ enum class FlowCondition : u64 {
     Fcsm_Tr = 0x1C, // TODO(bunnei): What is this used for?
 };
 
+enum class PredicateResultMode : u64 {
+    None = 0x0,
+    NotZero = 0x3,
+};
+
+enum class TextureType : u64 {
+    Texture1D = 0,
+    Texture2D = 1,
+    Texture3D = 2,
+    TextureCube = 3,
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -254,7 +270,7 @@ union Instruction {
             BitField<39, 1, u64> invert_a;
             BitField<40, 1, u64> invert_b;
             BitField<41, 2, LogicOperation> operation;
-            BitField<44, 2, u64> unk44;
+            BitField<44, 2, PredicateResultMode> pred_result_mode;
             BitField<48, 3, Pred> pred48;
         } lop;
 
@@ -284,6 +300,10 @@ union Instruction {
     } alu;
 
     union {
+        BitField<48, 1, u64> negate_b;
+    } fmul;
+
+    union {
         BitField<48, 1, u64> is_signed;
     } shift;
 
@@ -421,6 +441,8 @@ union Instruction {
     } conversion;
 
     union {
+        BitField<28, 1, u64> array;
+        BitField<29, 2, TextureType> texture_type;
         BitField<31, 4, u64> component_mask;
 
         bool IsComponentEnabled(size_t component) const {
@@ -429,29 +451,88 @@ union Instruction {
     } tex;
 
     union {
-        BitField<50, 3, u64> component_mask_selector;
+        BitField<28, 1, u64> array;
+        BitField<29, 2, TextureType> texture_type;
+        BitField<56, 2, u64> component;
+    } tld4;
+
+    union {
+        BitField<52, 2, u64> component;
+    } tld4s;
+
+    union {
         BitField<0, 8, Register> gpr0;
         BitField<28, 8, Register> gpr28;
+        BitField<50, 3, u64> component_mask_selector;
+        BitField<53, 4, u64> texture_info;
+
+        TextureType GetTextureType() const {
+            // The TEXS instruction has a weird encoding for the texture type.
+            if (texture_info == 0)
+                return TextureType::Texture1D;
+            if (texture_info >= 1 && texture_info <= 9)
+                return TextureType::Texture2D;
+            if (texture_info >= 10 && texture_info <= 11)
+                return TextureType::Texture3D;
+            if (texture_info >= 12 && texture_info <= 13)
+                return TextureType::TextureCube;
+
+            UNIMPLEMENTED();
+        }
+
+        bool IsArrayTexture() const {
+            // TEXS only supports Texture2D arrays.
+            return texture_info >= 7 && texture_info <= 9;
+        }
 
         bool HasTwoDestinations() const {
             return gpr28.Value() != Register::ZeroIndex;
         }
 
         bool IsComponentEnabled(size_t component) const {
-            static constexpr std::array<std::array<u32, 8>, 4> mask_lut{
-                {{},
-                 {0x1, 0x2, 0x4, 0x8, 0x3},
-                 {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
-                 {0x7, 0xb, 0xd, 0xe, 0xf}}};
+            static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{
+                {},
+                {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
+                {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
+                {0x7, 0xb, 0xd, 0xe, 0xf},
+            }};
 
             size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U};
             index |= gpr28.Value() != Register::ZeroIndex ? 2 : 0;
 
-            return ((1ull << component) & mask_lut[index][component_mask_selector]) != 0;
+            u32 mask = mask_lut[index][component_mask_selector];
+            // A mask of 0 means this instruction uses an unimplemented mask.
+            ASSERT(mask != 0);
+            return ((1ull << component) & mask) != 0;
         }
     } texs;
 
     union {
+        BitField<53, 4, u64> texture_info;
+
+        TextureType GetTextureType() const {
+            // The TLDS instruction has a weird encoding for the texture type.
+            if (texture_info >= 0 && texture_info <= 1) {
+                return TextureType::Texture1D;
+            }
+            if (texture_info == 2 || texture_info == 8 || texture_info == 12 ||
+                texture_info >= 4 && texture_info <= 6) {
+                return TextureType::Texture2D;
+            }
+            if (texture_info == 7) {
+                return TextureType::Texture3D;
+            }
+
+            UNIMPLEMENTED();
+        }
+
+        bool IsArrayTexture() const {
+            // TEXS only supports Texture2D arrays.
+            return texture_info == 8;
+        }
+    } tlds;
+
+    union {
         BitField<20, 24, u64> target;
         BitField<5, 1, u64> constant_buffer;
 
@@ -513,10 +594,14 @@ public:
         LD_A,
         LD_C,
         ST_A,
+        LDG, // Load from global memory
+        STG, // Store in global memory
         TEX,
-        TEXQ, // Texture Query
-        TEXS, // Texture Fetch with scalar/non-vec4 source/destinations
-        TLDS, // Texture Load with scalar/non-vec4 source/destinations
+        TEXQ,  // Texture Query
+        TEXS,  // Texture Fetch with scalar/non-vec4 source/destinations
+        TLDS,  // Texture Load with scalar/non-vec4 source/destinations
+        TLD4,  // Texture Load 4
+        TLD4S, // Texture Load 4 with scalar / non - vec4 source / destinations
         EXIT,
         IPA,
         FFMA_IMM, // Fused Multiply and Add
@@ -724,10 +809,14 @@ private:
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
             INST("1110111110010---", Id::LD_C, Type::Memory, "LD_C"),
             INST("1110111111110---", Id::ST_A, Type::Memory, "ST_A"),
+            INST("1110111011010---", Id::LDG, Type::Memory, "LDG"),
+            INST("1110111011011---", Id::STG, Type::Memory, "STG"),
             INST("110000----111---", Id::TEX, Type::Memory, "TEX"),
             INST("1101111101001---", Id::TEXQ, Type::Memory, "TEXQ"),
             INST("1101100---------", Id::TEXS, Type::Memory, "TEXS"),
             INST("1101101---------", Id::TLDS, Type::Memory, "TLDS"),
+            INST("110010----111---", Id::TLD4, Type::Memory, "TLD4"),
+            INST("1101111100------", Id::TLD4S, Type::Memory, "TLD4S"),
             INST("111000110000----", Id::EXIT, Type::Trivial, "EXIT"),
             INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
             INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 5a593c1f7..9758adcfd 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -55,6 +55,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     case RenderTargetFormat::RGBA8_UNORM:
     case RenderTargetFormat::RGBA8_SNORM:
     case RenderTargetFormat::RGBA8_SRGB:
+    case RenderTargetFormat::RGBA8_UINT:
     case RenderTargetFormat::RGB10_A2_UNORM:
     case RenderTargetFormat::BGRA8_UNORM:
     case RenderTargetFormat::RG16_UNORM:
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 97dcccb92..2697e1c27 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -30,6 +30,7 @@ enum class RenderTargetFormat : u32 {
     RGBA8_UNORM = 0xD5,
     RGBA8_SRGB = 0xD6,
     RGBA8_SNORM = 0xD7,
+    RGBA8_UINT = 0xD9,
     RG16_UNORM = 0xDA,
     RG16_SNORM = 0xDB,
     RG16_SINT = 0xDC,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 93eadde7a..fe1f55e85 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -98,7 +98,8 @@ RasterizerOpenGL::~RasterizerOpenGL() {}
 std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
                                                              GLintptr buffer_offset) {
     MICROPROFILE_SCOPE(OpenGL_VAO);
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
 
     state.draw.vertex_array = hw_vao.handle;
     state.draw.vertex_buffer = stream_buffer.GetHandle();
@@ -110,9 +111,13 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         if (!vertex_array.IsEnabled())
             continue;
 
-        const Tegra::GPUVAddr start = vertex_array.StartAddress();
+        Tegra::GPUVAddr start = vertex_array.StartAddress();
         const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
 
+        if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
+            start += vertex_array.stride * (gpu.state.current_instance / vertex_array.divisor);
+        }
+
         ASSERT(end > start);
         u64 size = end - start + 1;
 
@@ -124,7 +129,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset,
                            vertex_array.stride);
 
-        ASSERT_MSG(vertex_array.divisor == 0, "Instanced vertex arrays are not supported");
+        if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
+            // Tell OpenGL that this is an instanced vertex buffer to prevent accessing different
+            // indexes on each vertex. We do the instance indexing manually by incrementing the
+            // start address of the vertex buffer.
+            glVertexBindingDivisor(index, 1);
+        } else {
+            // Disable the vertex buffer instancing.
+            glVertexBindingDivisor(index, 0);
+        }
     }
 
     // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 38aa067b6..fb7476fb8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -94,6 +94,7 @@ struct FormatTuple {
 static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
+    {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},   // ABGR8UI
     {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5U
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
      false}, // A2B10G10R10U
@@ -245,6 +246,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         // clang-format off
         MortonCopy<true, PixelFormat::ABGR8U>,
         MortonCopy<true, PixelFormat::ABGR8S>,
+        MortonCopy<true, PixelFormat::ABGR8UI>,
         MortonCopy<true, PixelFormat::B5G6R5U>,
         MortonCopy<true, PixelFormat::A2B10G10R10U>,
         MortonCopy<true, PixelFormat::A1B5G5R5U>,
@@ -299,6 +301,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         // clang-format off
         MortonCopy<false, PixelFormat::ABGR8U>,
         MortonCopy<false, PixelFormat::ABGR8S>,
+        MortonCopy<false, PixelFormat::ABGR8UI>,
         MortonCopy<false, PixelFormat::B5G6R5U>,
         MortonCopy<false, PixelFormat::A2B10G10R10U>,
         MortonCopy<false, PixelFormat::A1B5G5R5U>,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index beec01746..fc8b44219 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -25,59 +25,60 @@ struct SurfaceParams {
     enum class PixelFormat {
         ABGR8U = 0,
         ABGR8S = 1,
-        B5G6R5U = 2,
-        A2B10G10R10U = 3,
-        A1B5G5R5U = 4,
-        R8U = 5,
-        R8UI = 6,
-        RGBA16F = 7,
-        RGBA16U = 8,
-        RGBA16UI = 9,
-        R11FG11FB10F = 10,
-        RGBA32UI = 11,
-        DXT1 = 12,
-        DXT23 = 13,
-        DXT45 = 14,
-        DXN1 = 15, // This is also known as BC4
-        DXN2UNORM = 16,
-        DXN2SNORM = 17,
-        BC7U = 18,
-        ASTC_2D_4X4 = 19,
-        G8R8U = 20,
-        G8R8S = 21,
-        BGRA8 = 22,
-        RGBA32F = 23,
-        RG32F = 24,
-        R32F = 25,
-        R16F = 26,
-        R16U = 27,
-        R16S = 28,
-        R16UI = 29,
-        R16I = 30,
-        RG16 = 31,
-        RG16F = 32,
-        RG16UI = 33,
-        RG16I = 34,
-        RG16S = 35,
-        RGB32F = 36,
-        SRGBA8 = 37,
-        RG8U = 38,
-        RG8S = 39,
-        RG32UI = 40,
-        R32UI = 41,
+        ABGR8UI = 2,
+        B5G6R5U = 3,
+        A2B10G10R10U = 4,
+        A1B5G5R5U = 5,
+        R8U = 6,
+        R8UI = 7,
+        RGBA16F = 8,
+        RGBA16U = 9,
+        RGBA16UI = 10,
+        R11FG11FB10F = 11,
+        RGBA32UI = 12,
+        DXT1 = 13,
+        DXT23 = 14,
+        DXT45 = 15,
+        DXN1 = 16, // This is also known as BC4
+        DXN2UNORM = 17,
+        DXN2SNORM = 18,
+        BC7U = 19,
+        ASTC_2D_4X4 = 20,
+        G8R8U = 21,
+        G8R8S = 22,
+        BGRA8 = 23,
+        RGBA32F = 24,
+        RG32F = 25,
+        R32F = 26,
+        R16F = 27,
+        R16U = 28,
+        R16S = 29,
+        R16UI = 30,
+        R16I = 31,
+        RG16 = 32,
+        RG16F = 33,
+        RG16UI = 34,
+        RG16I = 35,
+        RG16S = 36,
+        RGB32F = 37,
+        SRGBA8 = 38,
+        RG8U = 39,
+        RG8S = 40,
+        RG32UI = 41,
+        R32UI = 42,
 
         MaxColorFormat,
 
         // Depth formats
-        Z32F = 42,
-        Z16 = 43,
+        Z32F = 43,
+        Z16 = 44,
 
         MaxDepthFormat,
 
         // DepthStencil formats
-        Z24S8 = 44,
-        S8Z24 = 45,
-        Z32FS8 = 46,
+        Z24S8 = 45,
+        S8Z24 = 46,
+        Z32FS8 = 47,
 
         MaxDepthStencilFormat,
 
@@ -117,6 +118,7 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{
             1, // ABGR8U
             1, // ABGR8S
+            1, // ABGR8UI
             1, // B5G6R5U
             1, // A2B10G10R10U
             1, // A1B5G5R5U
@@ -175,6 +177,7 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
             32,  // ABGR8U
             32,  // ABGR8S
+            32,  // ABGR8UI
             16,  // B5G6R5U
             32,  // A2B10G10R10U
             16,  // A1B5G5R5U
@@ -257,6 +260,8 @@ struct SurfaceParams {
             return PixelFormat::ABGR8U;
         case Tegra::RenderTargetFormat::RGBA8_SNORM:
             return PixelFormat::ABGR8S;
+        case Tegra::RenderTargetFormat::RGBA8_UINT:
+            return PixelFormat::ABGR8UI;
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
             return PixelFormat::BGRA8;
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
@@ -327,6 +332,8 @@ struct SurfaceParams {
                 return PixelFormat::ABGR8U;
             case Tegra::Texture::ComponentType::SNORM:
                 return PixelFormat::ABGR8S;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::ABGR8UI;
             }
             LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
                          static_cast<u32>(component_type));
@@ -551,6 +558,7 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::R16_UINT:
         case Tegra::RenderTargetFormat::RG32_UINT:
         case Tegra::RenderTargetFormat::R32_UINT:
+        case Tegra::RenderTargetFormat::RGBA8_UINT:
             return ComponentType::UInt;
         case Tegra::RenderTargetFormat::RG16_SINT:
         case Tegra::RenderTargetFormat::R16_SINT:
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index bb01b3c27..ac6ccfec7 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -439,13 +439,12 @@ public:
         }
         declarations.AddNewLine();
 
-        // Append the sampler2D array for the used textures.
-        size_t num_samplers = GetSamplers().size();
-        if (num_samplers > 0) {
-            declarations.AddLine("uniform sampler2D " + SamplerEntry::GetArrayName(stage) + '[' +
-                                 std::to_string(num_samplers) + "];");
-            declarations.AddNewLine();
+        const auto& samplers = GetSamplers();
+        for (const auto& sampler : samplers) {
+            declarations.AddLine("uniform " + sampler.GetTypeString() + ' ' + sampler.GetName() +
+                                 ';');
         }
+        declarations.AddNewLine();
     }
 
     /// Returns a list of constant buffer declarations
@@ -457,13 +456,14 @@ public:
     }
 
     /// Returns a list of samplers used in the shader
-    std::vector<SamplerEntry> GetSamplers() const {
+    const std::vector<SamplerEntry>& GetSamplers() const {
         return used_samplers;
     }
 
     /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if
     /// necessary.
-    std::string AccessSampler(const Sampler& sampler) {
+    std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type,
+                              bool is_array) {
         size_t offset = static_cast<size_t>(sampler.index.Value());
 
         // If this sampler has already been used, return the existing mapping.
@@ -472,12 +472,13 @@ public:
                          [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; });
 
         if (itr != used_samplers.end()) {
+            ASSERT(itr->GetType() == type && itr->IsArray() == is_array);
             return itr->GetName();
         }
 
         // Otherwise create a new mapping for this sampler
         size_t next_index = used_samplers.size();
-        SamplerEntry entry{stage, offset, next_index};
+        SamplerEntry entry{stage, offset, next_index, type, is_array};
         used_samplers.emplace_back(entry);
         return entry.GetName();
     }
@@ -542,6 +543,10 @@ private:
             // shader.
             ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex);
             return "vec4(0, 0, uintBitsToFloat(instance_id.x), uintBitsToFloat(gl_VertexID))";
+        case Attribute::Index::FrontFacing:
+            // TODO(Subv): Find out what the values are for the other elements.
+            ASSERT(stage == Maxwell3D::Regs::ShaderStage::Fragment);
+            return "vec4(0, 0, 0, uintBitsToFloat(gl_FrontFacing ? 1 : 0))";
         default:
             const u32 index{static_cast<u32>(attribute) -
                             static_cast<u32>(Attribute::Index::Attribute_0)};
@@ -634,8 +639,8 @@ private:
     }
 
     /// Generates code representing a texture sampler.
-    std::string GetSampler(const Sampler& sampler) {
-        return regs.AccessSampler(sampler);
+    std::string GetSampler(const Sampler& sampler, Tegra::Shader::TextureType type, bool is_array) {
+        return regs.AccessSampler(sampler, type, is_array);
     }
 
     /**
@@ -743,6 +748,30 @@ private:
         return op->second;
     }
 
+    /**
+     * Transforms the input string GLSL operand into one that applies the abs() function and negates
+     * the output if necessary. When both abs and neg are true, the negation will be applied after
+     * taking the absolute value.
+     * @param operand The input operand to take the abs() of, negate, or both.
+     * @param abs Whether to apply the abs() function to the input operand.
+     * @param neg Whether to negate the input operand.
+     * @returns String corresponding to the operand after being transformed by the abs() and
+     * negation operations.
+     */
+    static std::string GetOperandAbsNeg(const std::string& operand, bool abs, bool neg) {
+        std::string result = operand;
+
+        if (abs) {
+            result = "abs(" + result + ')';
+        }
+
+        if (neg) {
+            result = "-(" + result + ')';
+        }
+
+        return result;
+    }
+
     /*
      * Returns whether the instruction at the specified offset is a 'sched' instruction.
      * Sched instructions always appear before a sequence of 3 instructions.
@@ -756,28 +785,51 @@ private:
     }
 
     void WriteLogicOperation(Register dest, LogicOperation logic_op, const std::string& op_a,
-                             const std::string& op_b) {
+                             const std::string& op_b,
+                             Tegra::Shader::PredicateResultMode predicate_mode,
+                             Tegra::Shader::Pred predicate) {
+        std::string result{};
         switch (logic_op) {
         case LogicOperation::And: {
-            regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " & " + op_b + ')', 1, 1);
+            result = '(' + op_a + " & " + op_b + ')';
             break;
         }
         case LogicOperation::Or: {
-            regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " | " + op_b + ')', 1, 1);
+            result = '(' + op_a + " | " + op_b + ')';
             break;
         }
         case LogicOperation::Xor: {
-            regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " ^ " + op_b + ')', 1, 1);
+            result = '(' + op_a + " ^ " + op_b + ')';
             break;
         }
         case LogicOperation::PassB: {
-            regs.SetRegisterToInteger(dest, true, 0, op_b, 1, 1);
+            result = op_b;
             break;
         }
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op));
             UNREACHABLE();
         }
+
+        if (dest != Tegra::Shader::Register::ZeroIndex) {
+            regs.SetRegisterToInteger(dest, true, 0, result, 1, 1);
+        }
+
+        using Tegra::Shader::PredicateResultMode;
+        // Write the predicate value depending on the predicate mode.
+        switch (predicate_mode) {
+        case PredicateResultMode::None:
+            // Do nothing.
+            return;
+        case PredicateResultMode::NotZero:
+            // Set the predicate to true if the result is not zero.
+            SetPredicate(static_cast<u64>(predicate), '(' + result + ") != 0");
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented predicate result mode: {}",
+                         static_cast<u32>(predicate_mode));
+            UNREACHABLE();
+        }
     }
 
     void WriteTexsInstruction(const Instruction& instr, const std::string& coord,
@@ -788,29 +840,56 @@ private:
         ++shader.scope;
         shader.AddLine(coord);
 
-        // TEXS has two destination registers. RG goes into gpr0+0 and gpr0+1, and BA
-        // goes into gpr28+0 and gpr28+1
-        size_t texs_offset{};
-
-        size_t src_elem{};
-        for (const auto& dest : {instr.gpr0.Value(), instr.gpr28.Value()}) {
-            size_t dest_elem{};
-            for (unsigned elem = 0; elem < 2; ++elem) {
-                if (!instr.texs.IsComponentEnabled(src_elem++)) {
-                    // Skip disabled components
-                    continue;
-                }
-                regs.SetRegisterToFloat(dest, elem + texs_offset, texture, 1, 4, false,
-                                        dest_elem++);
+        // TEXS has two destination registers and a swizzle. The first two elements in the swizzle
+        // go into gpr0+0 and gpr0+1, and the rest goes into gpr28+0 and gpr28+1
+
+        size_t written_components = 0;
+        for (u32 component = 0; component < 4; ++component) {
+            if (!instr.texs.IsComponentEnabled(component)) {
+                continue;
             }
 
-            if (!instr.texs.HasTwoDestinations()) {
-                // Skip the second destination
-                break;
+            if (written_components < 2) {
+                // Write the first two swizzle components to gpr0 and gpr0+1
+                regs.SetRegisterToFloat(instr.gpr0, component, texture, 1, 4, false,
+                                        written_components % 2);
+            } else {
+                ASSERT(instr.texs.HasTwoDestinations());
+                // Write the rest of the swizzle components to gpr28 and gpr28+1
+                regs.SetRegisterToFloat(instr.gpr28, component, texture, 1, 4, false,
+                                        written_components % 2);
             }
 
-            texs_offset += 2;
+            ++written_components;
         }
+
+        --shader.scope;
+        shader.AddLine('}');
+    }
+
+    /*
+     * Emits code to push the input target address to the SSY address stack, incrementing the stack
+     * top.
+     */
+    void EmitPushToSSYStack(u32 target) {
+        shader.AddLine('{');
+        ++shader.scope;
+        shader.AddLine("ssy_stack[ssy_stack_top] = " + std::to_string(target) + "u;");
+        shader.AddLine("ssy_stack_top++;");
+        --shader.scope;
+        shader.AddLine('}');
+    }
+
+    /*
+     * Emits code to pop an address from the SSY address stack, setting the jump address to the
+     * popped address and decrementing the stack top.
+     */
+    void EmitPopFromSSYStack() {
+        shader.AddLine('{');
+        ++shader.scope;
+        shader.AddLine("ssy_stack_top--;");
+        shader.AddLine("jmp_to = ssy_stack[ssy_stack_top];");
+        shader.AddLine("break;");
         --shader.scope;
         shader.AddLine('}');
     }
@@ -859,13 +938,6 @@ private:
         switch (opcode->GetType()) {
         case OpCode::Type::Arithmetic: {
             std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-            if (instr.alu.abs_a) {
-                op_a = "abs(" + op_a + ')';
-            }
-
-            if (instr.alu.negate_a) {
-                op_a = "-(" + op_a + ')';
-            }
 
             std::string op_b;
 
@@ -880,17 +952,10 @@ private:
                 }
             }
 
-            if (instr.alu.abs_b) {
-                op_b = "abs(" + op_b + ')';
-            }
-
-            if (instr.alu.negate_b) {
-                op_b = "-(" + op_b + ')';
-            }
-
             switch (opcode->GetId()) {
             case OpCode::Id::MOV_C:
             case OpCode::Id::MOV_R: {
+                // MOV does not have neither 'abs' nor 'neg' bits.
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
                 break;
             }
@@ -898,6 +963,8 @@ private:
             case OpCode::Id::FMUL_C:
             case OpCode::Id::FMUL_R:
             case OpCode::Id::FMUL_IMM: {
+                // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
+                op_b = GetOperandAbsNeg(op_b, false, instr.fmul.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
                                         instr.alu.saturate_d);
                 break;
@@ -905,11 +972,14 @@ private:
             case OpCode::Id::FADD_C:
             case OpCode::Id::FADD_R:
             case OpCode::Id::FADD_IMM: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
                                         instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::MUFU: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
                 switch (instr.sub_op) {
                 case SubOp::Cos:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
@@ -949,6 +1019,9 @@ private:
             case OpCode::Id::FMNMX_C:
             case OpCode::Id::FMNMX_R:
             case OpCode::Id::FMNMX_IMM: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
+
                 std::string condition =
                     GetPredicateCondition(instr.alu.fmnmx.pred, instr.alu.fmnmx.negate_pred != 0);
                 std::string parameters = op_a + ',' + op_b;
@@ -962,7 +1035,7 @@ private:
             case OpCode::Id::RRO_R:
             case OpCode::Id::RRO_IMM: {
                 // Currently RRO is only implemented as a register move.
-                // Usage of `abs_b` and `negate_b` here should also be correct.
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
                 LOG_WARNING(HW_GPU, "RRO instruction is incomplete");
                 break;
@@ -1099,7 +1172,9 @@ private:
                 if (instr.alu.lop32i.invert_b)
                     op_b = "~(" + op_b + ')';
 
-                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b);
+                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b,
+                                    Tegra::Shader::PredicateResultMode::None,
+                                    Tegra::Shader::Pred::UnusedIndex);
                 break;
             }
             default: {
@@ -1165,16 +1240,14 @@ private:
             case OpCode::Id::LOP_C:
             case OpCode::Id::LOP_R:
             case OpCode::Id::LOP_IMM: {
-                ASSERT_MSG(!instr.alu.lop.unk44, "Unimplemented");
-                ASSERT_MSG(instr.alu.lop.pred48 == Pred::UnusedIndex, "Unimplemented");
-
                 if (instr.alu.lop.invert_a)
                     op_a = "~(" + op_a + ')';
 
                 if (instr.alu.lop.invert_b)
                     op_b = "~(" + op_b + ')';
 
-                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b);
+                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b,
+                                    instr.alu.lop.pred_result_mode, instr.alu.lop.pred48);
                 break;
             }
             case OpCode::Id::IMNMX_C:
@@ -1239,8 +1312,6 @@ private:
             break;
         }
         case OpCode::Type::Conversion: {
-            ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
-
             switch (opcode->GetId()) {
             case OpCode::Id::I2I_R: {
                 ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
@@ -1437,10 +1508,29 @@ private:
                 break;
             }
             case OpCode::Id::TEX: {
-                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                ASSERT_MSG(instr.tex.array == 0, "TEX arrays unimplemented");
+                std::string coord{};
+
+                switch (instr.tex.texture_type) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    break;
+                }
+                case Tegra::Shader::TextureType::Texture3D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    std::string z = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    break;
+                }
+                default:
+                    UNIMPLEMENTED();
+                }
+
+                const std::string sampler =
+                    GetSampler(instr.sampler, instr.tex.texture_type, instr.tex.array);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
                 shader.AddLine("{");
@@ -1462,24 +1552,115 @@ private:
                 break;
             }
             case OpCode::Id::TEXS: {
-                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                std::string coord{};
+
+                switch (instr.texs.GetTextureType()) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    if (instr.texs.IsArrayTexture()) {
+                        std::string index = regs.GetRegisterAsInteger(instr.gpr8);
+                        std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                        std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");";
+                    } else {
+                        std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                        std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    }
+                    break;
+                }
+                case Tegra::Shader::TextureType::TextureCube: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    std::string z = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    break;
+                }
+                default:
+                    UNIMPLEMENTED();
+                }
+                const std::string sampler = GetSampler(instr.sampler, instr.texs.GetTextureType(),
+                                                       instr.texs.IsArrayTexture());
 
                 const std::string texture = "texture(" + sampler + ", coords)";
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
             case OpCode::Id::TLDS: {
-                const std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsInteger(instr.gpr20);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "ivec2 coords = ivec2(" + op_a + ", " + op_b + ");";
+                ASSERT(instr.tlds.GetTextureType() == Tegra::Shader::TextureType::Texture2D);
+                ASSERT(instr.tlds.IsArrayTexture() == false);
+                std::string coord{};
+
+                switch (instr.tlds.GetTextureType()) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    if (instr.tlds.IsArrayTexture()) {
+                        UNIMPLEMENTED();
+                    } else {
+                        std::string x = regs.GetRegisterAsInteger(instr.gpr8);
+                        std::string y = regs.GetRegisterAsInteger(instr.gpr20);
+                        coord = "ivec2 coords = ivec2(" + x + ", " + y + ");";
+                    }
+                    break;
+                }
+                default:
+                    UNIMPLEMENTED();
+                }
+                const std::string sampler = GetSampler(instr.sampler, instr.tlds.GetTextureType(),
+                                                       instr.tlds.IsArrayTexture());
                 const std::string texture = "texelFetch(" + sampler + ", coords, 0)";
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
+            case OpCode::Id::TLD4: {
+                ASSERT(instr.tld4.texture_type == Tegra::Shader::TextureType::Texture2D);
+                ASSERT(instr.tld4.array == 0);
+                std::string coord{};
+
+                switch (instr.tld4.texture_type) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    break;
+                }
+                default:
+                    UNIMPLEMENTED();
+                }
+
+                const std::string sampler =
+                    GetSampler(instr.sampler, instr.tld4.texture_type, instr.tld4.array);
+                // Add an extra scope and declare the texture coords inside to prevent
+                // overwriting them in case they are used as outputs of the texs instruction.
+                shader.AddLine("{");
+                ++shader.scope;
+                shader.AddLine(coord);
+                const std::string texture = "textureGather(" + sampler + ", coords, " +
+                                            std::to_string(instr.tld4.component) + ')';
+
+                size_t dest_elem{};
+                for (size_t elem = 0; elem < 4; ++elem) {
+                    if (!instr.tex.IsComponentEnabled(elem)) {
+                        // Skip disabled components
+                        continue;
+                    }
+                    regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem);
+                    ++dest_elem;
+                }
+                --shader.scope;
+                shader.AddLine("}");
+                break;
+            }
+            case OpCode::Id::TLD4S: {
+                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
+                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
+                // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction.
+                const std::string sampler =
+                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false);
+                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                const std::string texture = "textureGather(" + sampler + ", coords, " +
+                                            std::to_string(instr.tld4s.component) + ')';
+                WriteTexsInstruction(instr, coord, texture);
+                break;
+            }
             default: {
                 LOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName());
                 UNREACHABLE();
@@ -1843,13 +2024,13 @@ private:
                 ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported");
 
                 u32 target = offset + instr.bra.GetBranchTarget();
-                shader.AddLine("ssy_target = " + std::to_string(target) + "u;");
+                EmitPushToSSYStack(target);
                 break;
             }
             case OpCode::Id::SYNC: {
                 // The SYNC opcode jumps to the address previously set by the SSY opcode
                 ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always);
-                shader.AddLine("{ jmp_to = ssy_target; break; }");
+                EmitPopFromSSYStack();
                 break;
             }
             case OpCode::Id::DEPBAR: {
@@ -1920,7 +2101,13 @@ private:
             } else {
                 labels.insert(subroutine.begin);
                 shader.AddLine("uint jmp_to = " + std::to_string(subroutine.begin) + "u;");
-                shader.AddLine("uint ssy_target = 0u;");
+
+                // TODO(Subv): Figure out the actual depth of the SSY stack, for now it seems
+                // unlikely that shaders will use 20 nested SSYs.
+                constexpr u32 SSY_STACK_SIZE = 20;
+                shader.AddLine("uint ssy_stack[" + std::to_string(SSY_STACK_SIZE) + "];");
+                shader.AddLine("uint ssy_stack_top = 0u;");
+
                 shader.AddLine("while (true) {");
                 ++shader.scope;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 4729ce0fc..db48da645 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -11,6 +11,7 @@
 #include <vector>
 #include "common/common_types.h"
 #include "common/hash.h"
+#include "video_core/engines/shader_bytecode.h"
 
 namespace GLShader {
 
@@ -72,8 +73,9 @@ class SamplerEntry {
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 public:
-    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index)
-        : offset(offset), stage(stage), sampler_index(index) {}
+    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index,
+                 Tegra::Shader::TextureType type, bool is_array)
+        : offset(offset), stage(stage), sampler_index(index), type(type), is_array(is_array) {}
 
     size_t GetOffset() const {
         return offset;
@@ -88,8 +90,41 @@ public:
     }
 
     std::string GetName() const {
-        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '[' +
-               std::to_string(sampler_index) + ']';
+        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '_' +
+               std::to_string(sampler_index);
+    }
+
+    std::string GetTypeString() const {
+        using Tegra::Shader::TextureType;
+        std::string glsl_type;
+
+        switch (type) {
+        case TextureType::Texture1D:
+            glsl_type = "sampler1D";
+            break;
+        case TextureType::Texture2D:
+            glsl_type = "sampler2D";
+            break;
+        case TextureType::Texture3D:
+            glsl_type = "sampler3D";
+            break;
+        case TextureType::TextureCube:
+            glsl_type = "samplerCube";
+            break;
+        default:
+            UNIMPLEMENTED();
+        }
+        if (is_array)
+            glsl_type += "Array";
+        return glsl_type;
+    }
+
+    Tegra::Shader::TextureType GetType() const {
+        return type;
+    }
+
+    bool IsArray() const {
+        return is_array;
     }
 
     static std::string GetArrayName(Maxwell::ShaderStage stage) {
@@ -100,11 +135,14 @@ private:
     static constexpr std::array<const char*, Maxwell::MaxShaderStage> TextureSamplerNames = {
         "tex_vs", "tex_tessc", "tex_tesse", "tex_gs", "tex_fs",
     };
+
     /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
     /// instruction.
     size_t offset;
-    Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used.
-    size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
+    Maxwell::ShaderStage stage;      ///< Shader stage where this sampler was used.
+    size_t sampler_index;            ///< Value used to index into the generated GLSL sampler array.
+    Tegra::Shader::TextureType type; ///< The type used to sample this texture (Texture2D, etc)
+    bool is_array; ///< Whether the texture is being sampled as an array texture or not.
 };
 
 struct ShaderEntries {
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 8f719fdd8..5d91a0c2f 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -147,6 +147,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         // GL_CLAMP_TO_BORDER to get the border color of the texture, and then sample the edge to
         // manually mix them. However the shader part of this is not yet implemented.
         return GL_CLAMP_TO_BORDER;
+    case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
+        return GL_MIRROR_CLAMP_TO_EDGE;
     }
     LOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
     UNREACHABLE();