10 files changed, 230 insertions, 81 deletions
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index e00a3fb70..8d86020f6 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/control_flow.h"
+#include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
@@ -115,17 +116,6 @@ Pred GetPredicate(u32 index, bool negated) {
     return static_cast<Pred>(static_cast<u64>(index) + (negated ? 8ULL : 0ULL));
 }
 
-/**
- * Returns whether the instruction at the specified offset is a 'sched' instruction.
- * Sched instructions always appear before a sequence of 3 instructions.
- */
-constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
-    constexpr u32 SchedPeriod = 4;
-    u32 absolute_offset = offset - main_offset;
-
-    return (absolute_offset % SchedPeriod) == 0;
-}
-
 enum class ParseResult : u32 {
     ControlCaught,
     BlockEnd,
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index 8427837b7..a75a5cc63 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -13,6 +13,7 @@
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
 #include "video_core/shader/control_flow.h"
+#include "video_core/shader/memory_util.h"
 #include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
@@ -23,17 +24,6 @@ using Tegra::Shader::OpCode;
 
 namespace {
 
-/**
- * Returns whether the instruction at the specified offset is a 'sched' instruction.
- * Sched instructions always appear before a sequence of 3 instructions.
- */
-constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
-    constexpr u32 SchedPeriod = 4;
-    u32 absolute_offset = offset - main_offset;
-
-    return (absolute_offset % SchedPeriod) == 0;
-}
-
 void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver,
                               const std::list<Sampler>& used_samplers) {
     if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) {
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index ee7d9a29d..a276aee44 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -19,22 +19,46 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (opcode->get().GetId() == OpCode::Id::HADD2_C ||
-        opcode->get().GetId() == OpCode::Id::HADD2_R) {
+    bool negate_a = false;
+    bool negate_b = false;
+    bool absolute_a = false;
+    bool absolute_b = false;
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HADD2_R:
         if (instr.alu_half.ftz == 0) {
             LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
         }
+        negate_a = ((instr.value >> 43) & 1) != 0;
+        negate_b = ((instr.value >> 31) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 30) & 1) != 0;
+        break;
+    case OpCode::Id::HADD2_C:
+        if (instr.alu_half.ftz == 0) {
+            LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+        }
+        negate_a = ((instr.value >> 43) & 1) != 0;
+        negate_b = ((instr.value >> 56) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 54) & 1) != 0;
+        break;
+    case OpCode::Id::HMUL2_R:
+        negate_a = ((instr.value >> 43) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 30) & 1) != 0;
+        break;
+    case OpCode::Id::HMUL2_C:
+        negate_b = ((instr.value >> 31) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 54) & 1) != 0;
+        break;
     }
 
-    const bool negate_a =
-        opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0;
-    const bool negate_b =
-        opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0;
-
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a);
-    op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a);
+    op_a = GetOperandAbsNegHalf(op_a, absolute_a, negate_a);
 
-    auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> {
+    auto [type_b, op_b] = [this, instr, opcode]() -> std::pair<HalfType, Node> {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HMUL2_C:
@@ -48,17 +72,16 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
         }
     }();
     op_b = UnpackHalfFloat(op_b, type_b);
-    // redeclaration to avoid a bug in clang with reusing local bindings in lambdas
-    Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b);
+    op_b = GetOperandAbsNegHalf(op_b, absolute_b, negate_b);
 
-    Node value = [&]() {
+    Node value = [this, opcode, op_a, op_b = op_b] {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HADD2_R:
-            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt);
+            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
         case OpCode::Id::HMUL2_C:
         case OpCode::Id::HMUL2_R:
-            return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt);
+            return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
         default:
             UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName());
             return Immediate(0);
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index 0f4c3103a..a041519b7 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -35,15 +35,38 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
     case OpCode::Id::IADD_C:
     case OpCode::Id::IADD_R:
     case OpCode::Id::IADD_IMM: {
-        UNIMPLEMENTED_IF_MSG(instr.alu.saturate_d, "IADD saturation not implemented");
+        UNIMPLEMENTED_IF_MSG(instr.alu.saturate_d, "IADD.SAT");
+        UNIMPLEMENTED_IF_MSG(instr.iadd.x && instr.generates_cc, "IADD.X Rd.CC");
 
         op_a = GetOperandAbsNegInteger(op_a, false, instr.alu_integer.negate_a, true);
         op_b = GetOperandAbsNegInteger(op_b, false, instr.alu_integer.negate_b, true);
 
-        const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b);
+        Node value = Operation(OperationCode::UAdd, op_a, op_b);
 
-        SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
-        SetRegister(bb, instr.gpr0, value);
+        if (instr.iadd.x) {
+            Node carry = GetInternalFlag(InternalFlag::Carry);
+            Node x = Operation(OperationCode::Select, std::move(carry), Immediate(1), Immediate(0));
+            value = Operation(OperationCode::UAdd, std::move(value), std::move(x));
+        }
+
+        if (instr.generates_cc) {
+            const Node i0 = Immediate(0);
+
+            Node zero = Operation(OperationCode::LogicalIEqual, value, i0);
+            Node sign = Operation(OperationCode::LogicalILessThan, value, i0);
+            Node carry = Operation(OperationCode::LogicalAddCarry, op_a, op_b);
+
+            Node pos_a = Operation(OperationCode::LogicalIGreaterThan, op_a, i0);
+            Node pos_b = Operation(OperationCode::LogicalIGreaterThan, op_b, i0);
+            Node pos = Operation(OperationCode::LogicalAnd, std::move(pos_a), std::move(pos_b));
+            Node overflow = Operation(OperationCode::LogicalAnd, pos, sign);
+
+            SetInternalFlag(bb, InternalFlag::Zero, std::move(zero));
+            SetInternalFlag(bb, InternalFlag::Sign, std::move(sign));
+            SetInternalFlag(bb, InternalFlag::Carry, std::move(carry));
+            SetInternalFlag(bb, InternalFlag::Overflow, std::move(overflow));
+        }
+        SetRegister(bb, instr.gpr0, std::move(value));
         break;
     }
     case OpCode::Id::IADD3_C:
@@ -249,8 +272,8 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
             }
             case OpCode::Id::LEA_IMM: {
                 const bool neg = instr.lea.imm.neg != 0;
-                return {Immediate(static_cast<u32>(instr.lea.imm.entry_a)),
-                        GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true),
+                return {GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true),
+                        Immediate(static_cast<u32>(instr.lea.imm.entry_a)),
                         Immediate(static_cast<u32>(instr.lea.imm.entry_b))};
             }
             case OpCode::Id::LEA_RZ: {
diff --git a/src/video_core/shader/decode/register_set_predicate.cpp b/src/video_core/shader/decode/register_set_predicate.cpp
index 8d54cce34..6116c31aa 100644
--- a/src/video_core/shader/decode/register_set_predicate.cpp
+++ b/src/video_core/shader/decode/register_set_predicate.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <utility>
+
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
@@ -10,20 +12,20 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 
 namespace {
-constexpr u64 NUM_PROGRAMMABLE_PREDICATES = 7;
-}
+constexpr u64 NUM_CONDITION_CODES = 4;
+constexpr u64 NUM_PREDICATES = 7;
+} // namespace
 
 u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    UNIMPLEMENTED_IF(instr.p2r_r2p.mode != Tegra::Shader::R2pMode::Pr);
-
-    const Node apply_mask = [&] {
+    Node apply_mask = [this, opcode, instr] {
         switch (opcode->get().GetId()) {
         case OpCode::Id::R2P_IMM:
         case OpCode::Id::P2R_IMM:
@@ -34,39 +36,43 @@ u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) {
         }
     }();
 
-    const auto offset = static_cast<u32>(instr.p2r_r2p.byte) * 8;
+    const u32 offset = static_cast<u32>(instr.p2r_r2p.byte) * 8;
+
+    const bool cc = instr.p2r_r2p.mode == Tegra::Shader::R2pMode::Cc;
+    const u64 num_entries = cc ? NUM_CONDITION_CODES : NUM_PREDICATES;
+    const auto get_entry = [this, cc](u64 entry) {
+        return cc ? GetInternalFlag(static_cast<InternalFlag>(entry)) : GetPredicate(entry);
+    };
 
     switch (opcode->get().GetId()) {
     case OpCode::Id::R2P_IMM: {
-        const Node mask = GetRegister(instr.gpr8);
+        Node mask = GetRegister(instr.gpr8);
 
-        for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) {
-            const auto shift = static_cast<u32>(pred);
+        for (u64 entry = 0; entry < num_entries; ++entry) {
+            const u32 shift = static_cast<u32>(entry);
 
-            const Node apply_compare = BitfieldExtract(apply_mask, shift, 1);
-            const Node condition =
-                Operation(OperationCode::LogicalUNotEqual, apply_compare, Immediate(0));
+            Node apply = BitfieldExtract(apply_mask, shift, 1);
+            Node condition = Operation(OperationCode::LogicalUNotEqual, apply, Immediate(0));
 
-            const Node value_compare = BitfieldExtract(mask, offset + shift, 1);
-            const Node value =
-                Operation(OperationCode::LogicalUNotEqual, value_compare, Immediate(0));
+            Node compare = BitfieldExtract(mask, offset + shift, 1);
+            Node value = Operation(OperationCode::LogicalUNotEqual, move(compare), Immediate(0));
 
-            const Node code = Operation(OperationCode::LogicalAssign, GetPredicate(pred), value);
-            bb.push_back(Conditional(condition, {code}));
+            Node code = Operation(OperationCode::LogicalAssign, get_entry(entry), move(value));
+            bb.push_back(Conditional(condition, {move(code)}));
         }
         break;
     }
     case OpCode::Id::P2R_IMM: {
         Node value = Immediate(0);
-        for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) {
-            Node bit = Operation(OperationCode::Select, GetPredicate(pred), Immediate(1U << pred),
+        for (u64 entry = 0; entry < num_entries; ++entry) {
+            Node bit = Operation(OperationCode::Select, get_entry(entry), Immediate(1U << entry),
                                  Immediate(0));
-            value = Operation(OperationCode::UBitwiseOr, std::move(value), std::move(bit));
+            value = Operation(OperationCode::UBitwiseOr, move(value), move(bit));
         }
-        value = Operation(OperationCode::UBitwiseAnd, std::move(value), apply_mask);
-        value = BitfieldInsert(GetRegister(instr.gpr8), std::move(value), offset, 8);
+        value = Operation(OperationCode::UBitwiseAnd, move(value), apply_mask);
+        value = BitfieldInsert(GetRegister(instr.gpr8), move(value), offset, 8);
 
-        SetRegister(bb, instr.gpr0, std::move(value));
+        SetRegister(bb, instr.gpr0, move(value));
         break;
     }
     default:
diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp
new file mode 100644
index 000000000..074f21691
--- /dev/null
+++ b/src/video_core/shader/memory_util.cpp
@@ -0,0 +1,77 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstddef>
+
+#include <boost/container_hash/hash.hpp>
+
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/shader/memory_util.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+GPUVAddr GetShaderAddress(Core::System& system,
+                          Tegra::Engines::Maxwell3D::Regs::ShaderProgram program) {
+    const auto& gpu{system.GPU().Maxwell3D()};
+    const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]};
+    return gpu.regs.code_address.CodeAddress() + shader_config.offset;
+}
+
+bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
+    // Sched instructions appear once every 4 instructions.
+    constexpr std::size_t SchedPeriod = 4;
+    const std::size_t absolute_offset = offset - main_offset;
+    return (absolute_offset % SchedPeriod) == 0;
+}
+
+std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute) {
+    // This is the encoded version of BRA that jumps to itself. All Nvidia
+    // shaders end with one.
+    static constexpr u64 SELF_JUMPING_BRANCH = 0xE2400FFFFF07000FULL;
+    static constexpr u64 MASK = 0xFFFFFFFFFF7FFFFFULL;
+
+    const std::size_t start_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
+    std::size_t offset = start_offset;
+    while (offset < program.size()) {
+        const u64 instruction = program[offset];
+        if (!IsSchedInstruction(offset, start_offset)) {
+            if ((instruction & MASK) == SELF_JUMPING_BRANCH) {
+                // End on Maxwell's "nop" instruction
+                break;
+            }
+            if (instruction == 0) {
+                break;
+            }
+        }
+        ++offset;
+    }
+    // The last instruction is included in the program size
+    return std::min(offset + 1, program.size());
+}
+
+ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_addr,
+                          const u8* host_ptr, bool is_compute) {
+    ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
+    ASSERT_OR_EXECUTE(host_ptr != nullptr, { return code; });
+    memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64));
+    code.resize(CalculateProgramSize(code, is_compute));
+    return code;
+}
+
+u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code,
+                        const ProgramCode& code_b) {
+    u64 unique_identifier = boost::hash_value(code);
+    if (is_a) {
+        // VertexA programs include two programs
+        boost::hash_combine(unique_identifier, boost::hash_value(code_b));
+    }
+    return unique_identifier;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/memory_util.h b/src/video_core/shader/memory_util.h
new file mode 100644
index 000000000..be90d24fd
--- /dev/null
+++ b/src/video_core/shader/memory_util.h
@@ -0,0 +1,47 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace VideoCommon::Shader {
+
+using ProgramCode = std::vector<u64>;
+
+constexpr u32 STAGE_MAIN_OFFSET = 10;
+constexpr u32 KERNEL_MAIN_OFFSET = 0;
+
+/// Gets the address for the specified shader stage program
+GPUVAddr GetShaderAddress(Core::System& system,
+                          Tegra::Engines::Maxwell3D::Regs::ShaderProgram program);
+
+/// Gets if the current instruction offset is a scheduler instruction
+bool IsSchedInstruction(std::size_t offset, std::size_t main_offset);
+
+/// Calculates the size of a program stream
+std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute);
+
+/// Gets the shader program code from memory for the specified address
+ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_addr,
+                          const u8* host_ptr, bool is_compute);
+
+/// Hashes one (or two) program streams
+u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code,
+                        const ProgramCode& code_b = {});
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index d0656b581..601c822d2 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -132,6 +132,8 @@ enum class OperationCode {
     LogicalUNotEqual,     /// (uint a, uint b) -> bool
     LogicalUGreaterEqual, /// (uint a, uint b) -> bool
 
+    LogicalAddCarry, /// (uint a, uint b) -> bool
+
     Logical2HLessThan,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
     Logical2HEqual,               /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
     Logical2HLessEqual,           /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 748d73087..15ae152f2 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -18,6 +18,7 @@
 #include "video_core/engines/shader_header.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/compiler_settings.h"
+#include "video_core/shader/memory_util.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/registry.h"
 
@@ -25,8 +26,6 @@ namespace VideoCommon::Shader {
 
 struct ShaderBlock;
 
-using ProgramCode = std::vector<u64>;
-
 constexpr u32 MAX_PROGRAM_LENGTH = 0x1000;
 
 struct ConstBuffer {
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 513e9bf49..eb97bfd41 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -153,21 +153,13 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
         if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
             return {};
         }
-        s64 current_cursor = cursor;
-        while (current_cursor > 0) {
-            // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
-            // register that it uses as operand
-            const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1);
-            current_cursor = new_cursor;
-            if (!source) {
-                continue;
-            }
-            const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor);
-            if (base_address != nullptr) {
-                return {base_address, index, offset};
-            }
+        // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
+        // register that it uses as operand
+        const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
+        if (!source) {
+            return {};
         }
-        return {};
+        return TrackCbuf(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
         for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {